External data

# Table of contents -------------------------------------------------------
# 1. vdemdata CHECK
# 2. Rainbowmap CHECK
# 3. Economist Democracy Scores for 2018 CHECK
# 4. Happiness CHECK
# 5. GDP per capita CHECK
# 6. ESS round 9 CHECK
# 7. Unemployment rate CHECK
# 8. Gender equality index CHECK  


## further ideas:
# which political parties governed in the years before (i.e., left, centre-right etc.)
# Gini index
# migrant acceptance scores
# welfare state/ size of government relative to GDP
# physical/ mental health issues; quality and equity of healthcare
# solidarity with minority groups
# education


# load libraries ----------------------------------------------------------
# devtools::install_github("vdeminstitute/vdemdata")

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(readr)
library(readxl)
library(vdemdata)
library(survey) 
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loading required package: survival
## 
## Attaching package: 'survey'
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(countrycode)
library(rvest)
## 
## Attaching package: 'rvest'
## 
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(stringr)
library(mice)       
## 
## Attaching package: 'mice'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(VIM)
## Loading required package: colorspace
## VIM is ready to use.
## 
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## 
## The following object is masked from 'package:datasets':
## 
##     sleep
library(visdat)    
library(naniar)     
library(caret)      
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:survival':
## 
##     cluster
## 
## The following object is masked from 'package:purrr':
## 
##     lift
# mapping -----------------------------------------------------------------
# we comment out the country codes as we likely don't need them
survey_country_mapping <- data.frame(
  country_name = c("France", "Belgium", "Netherlands", "Germany", "Italy", 
                   "Luxembourg", "Denmark", "Ireland", "United Kingdom", "Greece", 
                   "Spain", "Portugal", "Finland", "Sweden", 
                   "Austria", "Cyprus", "Czech Republic", "Estonia", "Hungary", 
                   "Latvia", "Lithuania", "Malta", "Poland", "Slovakia", 
                   "Slovenia", "Bulgaria", "Romania", "Croatia"),
  #survey_country_code = c(1, 2, 3, 4, 5, 
  #                        6, 7, 8, 9, 11, 
  #                        12, 13, 16, 17, 
  #                        18, 19, 20, 21, 22, 
  #                        23, 24, 25, 26, 27, 
  #                        28, 29, 30, 32),
  iso2 = c("FR", "BE", "NL", "DE", "IT", 
           "LU", "DK", "IE", "GB", "GR", 
           "ES", "PT", "FI", "SE", 
           "AT", "CY", "CZ", "EE", "HU", 
           "LV", "LT", "MT", "PL", "SK", 
           "SI", "BG", "RO", "HR"),
  stringsAsFactors = FALSE)

# create a mapping for country name standardization: the Czech Republic is a problem
country_name_mapping <- c("Czechia" = "Czech Republic")


# 1. vdemdata ----------------------------------------------------------------
# install the package from GitHub first:
# devtools::install_github("vdeminstitute/vdemdata")
vdem_data <- vdem

# apply country name standardization before filtering
vdem_data_standardized <- vdem_data %>%
  mutate(
    # Standardize country names
    country_name = ifelse(country_name %in% names(country_name_mapping),
                          country_name_mapping[country_name],
                          country_name))

# filter for most recent data (2019 to match survey data)
vdem_2019 <- vdem_data_standardized %>%
  filter(country_name %in% survey_country_mapping$country_name, year == 2019) %>%
  select(country_name, v2x_libdem, v2x_polyarchy, v2x_gender, 
         v2x_egaldem, v2x_liberal, v2xcs_ccsi, v2x_freexp)  # select relevant variables

saveRDS(vdem_2019, file = "vdem_eu_2019.rds")
write.csv(vdem_2019, "vdem_eu_2019.csv")

## from the codebook: 
# v2x_libdem: index of liberal democracy
# v2x_polyarchy: index of electoral democracy
# v2x_gender: index of women's political empowerment
# v2x_egaldem: index of egalitarian democracy   
# v2x_liberal: index of civil liberties
# v2xcs_ccsi:  index of civil society participation
# v2x_freexp: index of freedom of expression



# 2. Rainbowmap --------------------------------------------------------------
# https://rainbowmap.ilga-europe.org/

# rainbow_data <- read_csv("data/raw/2024-rainbow-map-data.csv")

# but the problem: that's for 2024, not 2019 or before 2019
# hence, we would need to get the data for 2019 and years before:

# create data frames for each year
df_2019 <- data.frame(
  Country = c("Malta", "Belgium", "United Kingdom", "Norway", "France", "Finland",
              "Denmark", "Spain", "Portugal", "Sweden", "Netherlands", "Austria",
              "Germany", "Croatia", "Greece", "Ireland", "Hungary", "Luxembourg",
              "Iceland", "Slovenia", "Montenegro", "Estonia", "Switzerland", "Georgia",
              "Bosnia & Herzegovina", "Slovakia", "Albania", "Serbia", "Bulgaria", 
              "Czechia", "Kosovo", "Andorra", "Cyprus", "Romania", "Ukraine", 
              "Lithuania", "Italy", "Poland", "Latvia", "Belarus", "Moldova", 
              "Russia", "North Macedonia", "Liechtenstein", "San Marino", "Armenia",
              "Turkey", "Monaco", "Azerbaijan"),
  Value = c(74.72, 70.40, 67.62, 63.64, 62.73, 62.20, 60.31, 58.49, 57.50, 52.86,
            49.72, 48.54, 47.45, 45.83, 44.82, 44.72, 42.83, 41.34, 40.20, 39.82, 
            37.16, 34.38, 30.06, 29.49, 29.37, 29.04, 27.74, 26.43, 25.75, 25.50,
            25.34, 23.93, 22.03, 20.67, 19.97, 19.91, 19.43, 18.10, 16.83, 15.82,
            15.10, 11.75, 10.88, 10.08, 9.87, 8.50, 8.50, 7.31, 5.67),
  Year = 2019)

df_2018 <- data.frame(
  Country = c("Malta", "Belgium", "United Kingdom", "Finland", "France", "Norway",
              "Portugal", "Denmark", "Spain", "Sweden", "Netherlands", "Germany",
              "Austria", "Greece", "Ireland", "Croatia", "Slovenia", "Luxembourg",
              "Iceland", "Hungary", "Estonia", "Switzerland", "Montenegro", "Andorra",
              "Albania", "Kosovo", "Bosnia & Herzegovina", "Serbia", "Czechia", 
              "Cyprus", "Slovakia", "Italy", "Georgia", "Bulgaria", "Lithuania",
              "Romania", "Ukraine", "Poland", "Liechtenstein", "Latvia", 
              "North Macedonia", "Belarus", "Moldova", "San Marino", "Russia", 
              "Monaco", "Turkey", "Armenia", "Azerbaijan"),
  Value = c(91.94, 78.70, 73.48, 73.27, 72.81, 72.74, 69.16, 67.69, 67.03, 60.10,
            59.64, 59.00, 56.40, 52.32, 52.22, 50.58, 47.73, 47.48, 47.22, 47.16,
            39.34, 38.44, 37.74, 34.81, 33.24, 32.98, 31.38, 29.68, 29.20, 28.95,
            28.65, 28.82, 25.87, 24.15, 21.78, 21.12, 20.95, 18.23, 17.87, 16.07,
            14.03, 13.35, 13.08, 12.32, 10.80, 9.78, 8.60, 7.20, 4.70),
  Year = 2018)

df_2017 <- data.frame(
  Country = c("Malta", "Norway", "United Kingdom", "Belgium", "France", "Portugal",
              "Finland", "Denmark", "Spain", "Netherlands", "Croatia", "Sweden", 
              "Austria", "Germany", "Ireland", "Iceland", "Greece", "Luxembourg", 
              "Hungary", "Slovenia", "Montenegro", "Andorra", "Estonia", "Albania",
              "Bosnia & Herzegovina", "Switzerland", "Kosovo", "Serbia", "Czechia", 
              "Cyprus", "Slovakia", "Italy", "Georgia", "Bulgaria", "Romania", 
              "Ukraine", "Poland", "Liechtenstein", "Lithuania", "Latvia", 
              "North Macedonia", "Belarus", "Moldova", "San Marino", "Monaco", 
              "Turkey", "Armenia", "Azerbaijan"),
  Value = c(77.74, 75.73, 71.86, 70.82, 69.16, 68.27, 67.69, 67.03, 64.44, 62.36,
            60.10, 55.58, 54.41, 52.22, 47.22, 46.92, 46.48, 44.82, 44.28, 38.64, 
            34.81, 33.31, 33.24, 31.34, 30.94, 30.48, 29.68, 29.20, 28.95, 27.60, 
            26.67, 25.87, 23.15, 21.12, 19.00, 18.23, 17.87, 17.28, 17.12, 16.03, 
            13.35, 13.08, 12.32, 9.78, 8.60, 7.20, 6.40, 4.70),
  Year = 2017)

df_2016 <- data.frame(
  Country = c("Malta", "Belgium", "United Kingdom", "Spain", "Denmark", "Portugal",
              "Finland", "France", "Croatia", "Netherlands", "Norway", "Sweden", 
              "Austria", "Iceland", "Greece", "Germany", "Ireland", "Hungary", 
              "Luxembourg", "Montenegro", "Estonia", "Albania", "Switzerland", 
              "Andorra", "Serbia", "Cyprus", "Slovenia", "Czechia", "Kosovo", 
              "Georgia", "Bosnia & Herzegovina", "Slovakia", "Bulgaria", "Romania",
              "Italy", "Poland", "Liechtenstein", "Lithuania", "Latvia", 
              "North Macedonia", "San Marino", "Moldova", "Belarus", "Monaco", 
              "Turkey", "Armenia", "Azerbaijan"),
  Value = c(77.75, 81.85, 79.19, 70.95, 70.90, 69.55, 67.25, 66.60, 66.55, 66.10, 
            65.15, 64.85, 62.21, 59.00, 58.30, 55.14, 54.70, 51.40, 50.35, 45.20, 
            38.25, 34.40, 33.15, 32.10, 32.00, 31.95, 31.65, 31.60, 31.55, 30.35, 
            29.40, 29.20, 24.00, 23.45, 19.75, 18.30, 18.20, 18.10, 17.35, 15.55, 
            14.40, 14.15, 13.35, 10.80, 8.75, 7.20, 4.85),
  Year = 2016)

df_2015 <- data.frame(
  Country = c("United Kingdom", "Belgium", "Malta", "Sweden", "Croatia", "Netherlands",
              "Norway", "Spain", "Denmark", "Portugal", "France", "Iceland", "Finland", 
              "Germany", "Austria", "Hungary", "Montenegro", "Luxembourg", "Albania",
              "Ireland", "Greece", "Georgia", "Czechia", "Estonia", "Slovenia",
              "Andorra", "Bosnia & Herzegovina", "Serbia", "Slovakia", "Romania",
              "Switzerland", "Bulgaria", "Poland", "Italy", "Liechtenstein", 
              "Lithuania", "Cyprus", "Kosovo", "Latvia", "Moldova", "Belarus",
              "San Marino", "North Macedonia", "Turkey", "Monaco", "Armenia", 
              "Azerbaijan"),
  Value = c(88.00, 83.00, 77.00, 72.00, 71.00, 69.00, 69.00, 69.00, 68.00, 67.00,
            65.00, 63.00, 62.00, 56.00, 52.00, 50.00, 46.00, 43.00, 42.00, 40.00,
            39.00, 36.00, 35.00, 34.00, 32.00, 31.00, 29.00, 29.00, 29.00, 28.00, 
            28.00, 27.00, 26.00, 22.00, 19.00, 19.00, 18.00, 18.00, 18.00, 16.00,
            14.00, 14.00, 13.00, 12.00, 11.00, 9.00, 5.00),
  Year = 2015)

df_2014 <- data.frame(
  Country = c("United Kingdom", "Belgium", "Spain", "Netherlands", "Norway", 
              "Portugal", "Sweden", "France", "Iceland", "Denmark", "Malta", 
              "Croatia", "Germany", "Hungary", "Austria", "Montenegro", "Finland",
              "Albania", "Slovenia", "Czechia", "Estonia", "Ireland", "Greece", 
              "Slovakia", "Serbia", "Bulgaria", "Switzerland", "Luxembourg", 
              "Romania", "Poland", "Italy", "Georgia", "Lithuania", "Andorra",
              "Bosnia & Herzegovina", "Cyprus", "Latvia", "Liechtenstein", "Kosovo",
              "Moldova", "Turkey", "San Marino", "Belarus", "North Macedonia", 
              "Ukraine", "Monaco", "Armenia", "Azerbaijan"),
  Value = c(80.25, 78.10, 73.26, 69.90, 68.40, 66.60, 65.30, 64.10, 63.95, 59.90,
            56.80, 56.30, 55.68, 53.65, 52.10, 47.05, 45.30, 38.40, 35.00, 34.65, 
            34.65, 33.65, 31.15, 30.50, 30.30, 30.00, 28.85, 28.35, 27.95, 27.65, 
            27.40, 28.05, 21.70, 20.60, 20.10, 19.65, 19.65, 18.00, 17.10, 16.50,
            14.15, 13.70, 13.60, 13.30, 11.65, 10.10, 8.50, 6.60),
  Year = 2014)

df_2013 <- data.frame(
  Country = c("United Kingdom", "Belgium", "Norway", "Sweden", "Spain", "Portugal", 
              "France", "Netherlands", "Denmark", "Iceland", "Hungary", "Germany",
              "Croatia", "Finland", "Austria", "Albania", "Malta", "Slovenia", 
              "Czechia", "Ireland", "Romania", "Estonia", "Switzerland", "Luxembourg",
              "Greece", "Slovakia", "Montenegro", "Serbia", "Poland", "Georgia",
              "Lithuania", "Andorra", "Bosnia & Herzegovina", "Cyprus", "Latvia", 
              "Italy", "Bulgaria", "Liechtenstein", "Turkey", "San Marino", "Belarus",
              "Kosovo", "North Macedonia", "Ukraine", "Monaco", "Armenia", "Azerbaijan"),
  Value = c(78.50, 68.73, 65.65, 65.30, 65.04, 64.60, 64.10, 60.00, 59.80, 55.50,
            54.70, 54.29, 48.30, 47.25, 43.35, 38.40, 35.30, 35.00, 34.65, 33.65, 
            31.30, 28.90, 28.85, 28.35, 28.10, 28.90, 28.65, 25.05, 21.65, 21.05, 
            20.70, 20.60, 19.95, 19.65, 19.65, 19.40, 18.00, 15.50, 14.15, 13.70, 
            13.60, 13.50, 13.30, 11.65, 10.10, 7.50, 7.10),
  Year = 2013)

# combine all data frames into one
df_combined <- bind_rows(df_2019, df_2018, df_2017, df_2016, df_2015, df_2014, df_2013)

# create new, compressed df
# step 1: Filter data for 2019 and 2018
df_2019 <- df_combined %>% filter(Year == 2019) %>% rename(Value_2019 = Value)
df_2018 <- df_combined %>% filter(Year == 2018) %>% rename(Value_2018 = Value)

# step 2: Filter data for 2013 and 2014 and calculate the average
df_2013_2014 <- df_combined %>% filter(Year %in% c(2013, 2014)) %>%
  group_by(Country) %>%
  summarise(Avg_2013_2014 = mean(Value, na.rm = TRUE))

# step 3: Join the data frames for 2019 and 2018
df_compressed <- df_2019 %>%
  left_join(df_2018, by = "Country") %>%
  select(Country, Value_2019, Value_2018)

# step 4: Calculate the average for 2019 and 2018
df_compressed <- df_compressed %>%
  mutate(Avg_2019_2018 = (Value_2019 + Value_2018) / 2)

# step 5: Join the average for 2013 and 2014
df_compressed <- df_compressed %>%
  left_join(df_2013_2014, by = "Country")

# step 6: Calculate the difference between the averages
df_compressed <- df_compressed %>%
  mutate(Difference = Avg_2019_2018 - Avg_2013_2014)

# step 7: Select and reorder columns for the final compressed data frame
df_compressed <- df_compressed %>%
  select(Country, Value_2019, Value_2018, Avg_2019_2018, 
         Avg_2013_2014, Difference)

rainbow_df <- df_compressed

# save the data frame
saveRDS(rainbow_df, file = "rainbow_df.rds")
write.csv(rainbow_df, "rainbow_df.csv")



# 3. The Economist: Democracy scores 2018 ---------------------------------
# https://enperspectiva.uy/wp-content/uploads/2019/01/Democracy_Index_2018.pdf
democracy_scores <- data.frame(
  Country = c("Belgium", "Denmark", "Greece", "Spain", "Finland", "France", "Ireland", "Italy", "Luxembourg", "Netherlands", "Austria", "Portugal", "Sweden", "Germany", "United Kingdom", "Bulgaria", "Cyprus", "Czech Republic", "Estonia", "Hungary", "Latvia", "Lithuania", "Malta", "Poland", "Romania", "Slovakia", "Slovenia", "Croatia"),
  ISO2 = c("BE", "DK", "GR", "ES", "FI", "FR", "IE", "IT", "LU", "NL", "AT", "PT", "SE", "DE", "GB", "BG", "CY", "CZ", "EE", "HU", "LV", "LT", "MT", "PL", "RO", "SK", "SI", "HR"),
  Overall_score = c(7.78, 9.22, 7.29, 8.08, 9.14, 7.80, 9.15, 7.71, 8.81, 8.89, 8.29, 7.84, 9.39, 8.68, 8.53, 7.03, 7.59, 7.69, 7.97, 6.63, 7.38, 7.50, 8.21, 6.67, 6.38, 7.10, 7.50, 6.57),
  #Global_rank = c(31, 5, 39, 19, 8, 29, "6=", 33, 12, 11, 16, 27, 3, 13, 14, 46, 35, 34, "23=", "57=", 38, "36=", 18, "54=", "66=", 44, "36=", 60),
  #Regional_rank = c(17, 4, 20, 14, 6, 16, 5, 18, 9, 8, 12, 15, 3, 10, 11, 7, 19, 2, 1, 9, 5, "3=", 13, 8, 12, 6, "3=", 10),
  Electoral_process_and_pluralism = c(9.58, 10.00, 9.58, 9.17, 10.00, 9.58, 9.58, 9.58, 10.00, 9.58, 9.58, 9.58, 9.58, 9.58, 9.58, 9.17, 9.17, 9.58, 9.58, 8.75, 9.58, 9.58, 9.17, 9.17, 9.17, 9.58, 9.58, 9.17),
  Functioning_of_government = c(8.93, 9.29, 5.36, 7.14, 8.93, 7.50, 7.86, 6.07, 8.93, 9.29, 7.86, 7.50, 9.64, 8.57, 7.50, 6.43, 6.43, 6.79, 8.21, 6.07, 6.07, 6.43, 8.21, 6.07, 5.71, 6.79, 6.79, 6.07),
  Political_participation = c(5.00, 8.33, 6.11, 7.78, 8.33, 7.78, 8.33, 7.78, 6.67, 8.33, 8.33, 6.11, 8.33, 8.33, 8.33, 7.22, 6.67, 6.67, 6.67, 5.00, 5.56, 6.11, 6.11, 6.11, 5.00, 5.56, 6.67, 5.56),
  Political_culture = c(6.88, 9.38, 6.88, 7.50, 8.75, 5.63, 10.00, 6.88, 8.75, 8.13, 6.88, 6.88, 10.00, 7.50, 8.13, 4.38, 6.88, 6.88, 6.88, 6.25, 6.88, 6.25, 8.75, 4.38, 4.38, 5.63, 6.25, 5.00),
  Civil_liberties = c(8.53, 9.12, 8.53, 8.82, 9.71, 8.53, 10.00, 8.24, 9.71, 9.12, 8.82, 9.12, 9.41, 9.41, 9.12, 7.94, 8.82, 8.53, 8.53, 7.06, 8.82, 9.12, 8.82, 7.65, 7.65, 7.94, 8.24, 7.06),
  Regime_type = c("Flawed democracy", "Full democracy", "Flawed democracy", "Full democracy", "Full democracy", "Flawed democracy", "Full democracy", "Flawed democracy", "Full democracy", "Full democracy", "Full democracy", "Flawed democracy", "Full democracy", "Full democracy", "Full democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Full democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy"))

saveRDS(democracy_scores, file = "democracy_scores.rds")
write.csv(democracy_scores, "democracy_scores.csv")


# 4. Happiness data 2018 ---------------------------------------------------------------------
# https://s3.amazonaws.com/happiness-report/2018/WHR_web.pdf
happiness_scores <- data.frame(
  Country = c("Finland", "Denmark", "Greece", "Spain", "France", "Ireland", "Italy", "Luxembourg", "Netherlands", "Austria", "Portugal", "Sweden", "Germany", "United Kingdom", "Bulgaria", "Cyprus", "Czech Republic", "Estonia", "Hungary", "Latvia", "Lithuania", "Malta", "Poland", "Romania", "Slovakia", "Slovenia", "Croatia"),
  ISO2 = c("FI", "DK", "GR", "ES", "FR", "IE", "IT", "LU", "NL", "AT", "PT", "SE", "DE", "GB", "BG", "CY", "CZ", "EE", "HU", "LV", "LT", "MT", "PL", "RO", "SK", "SI", "HR"),
  Happiness_Score = c(7.632, 7.555, 5.358, 6.310, 6.489, 6.977, 6.000, 6.910, 7.441, 7.139, 5.410, 7.314, 6.965, 6.814, 4.933, 5.762, 6.711, 5.739, 5.620, 5.933, 5.952, 6.627, 6.123, 5.945, 6.173, 5.948, 5.321))

saveRDS(happiness_scores, file = "happiness_scores.rds")
write.csv(happiness_scores, "happiness_scores.csv")


# 5. GDP per capita ---------------------------------------------------------------------
df_GDP <- read_csv("data/raw/data_20250228194704.csv")
## New names:
## Rows: 1064 Columns: 10
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): IndicatorName, VariableName, MeasurementName, CountryCode, Alpha3Co... dbl
## (2): IndicatorCode, PeriodCode lgl (1): ...10
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...10`
# apply the mapping to the CountryName column
df_GDP <- df_GDP %>%
  mutate(
    # replace Czechia with Czech Republic
    CountryName = ifelse(CountryName %in% names(country_name_mapping), 
                         country_name_mapping[CountryName], 
                         CountryName)) %>%
  select("CountryName", "PeriodCode", "Value") %>%
  # now filter after standardizing the names
  filter(CountryName %in% survey_country_mapping$country_name)

df_GDP <- df_GDP %>%
  mutate(Value = as.numeric(as.character(Value)))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Value = as.numeric(as.character(Value))`.
## Caused by warning:
## ! NAs introduced by coercion
df_GDP <- df_GDP %>%
  # group by country
  group_by(CountryName) %>% 
  # find first and last year values
  summarize(
    gdp_2005 = Value[PeriodCode == 2005],
    gdp_2018 = Value[PeriodCode == 2018],
    # calculate relative growth
    gdp_growth = (gdp_2018 - gdp_2005) / gdp_2005 * 100) %>%
  # add ISO2 codes for easier joining with other datasets
  left_join(survey_country_mapping, by = c("CountryName" = "country_name")) %>%
  # select relevant columns
  select(CountryName, iso2, gdp_2005, gdp_2018, gdp_growth)

saveRDS(df_GDP, file = "df_GDP.rds")
write.csv(df_GDP, "df_GDP.csv")


# 6. ESS Round 9 ----------------------------------------------------------
# https://ess.sikt.no/en/datafile/b2b0bf39-176b-4eca-8d26-3c05ea83d2cb
ess_data <- read_csv("data/raw/ESS9e03_2.csv")
## Rows: 49519 Columns: 572
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): name, proddate, cntry, ctzshipd, cntbrthd, lnghom1, lnghom2, fbrn...
## dbl (562): essround, edition, idno, dweight, pspwght, pweight, anweight, nws...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# select interesting variables, country and weight variables
ess_selected <- ess_data %>%
  select(
    # identifiers and weights
    cntry,          # country code
    pspwght,        # post-stratification weight
    dweight,        # design weight
    
    # key variables of interest
    freehms,        # gays and lesbians free to live life as they wish
    lrscale,        # left-right political scale
    rlgdgr,         # how religious are you
    ipeqopt,        # important that people are treated equally
    atchctr,        # attachment to country
    eduyrs,         # years of education
    agea)            # age of respondent

# function to recode ESS special values (negative values are typically missing values)
recode_ess_missing <- function(x) {
  ifelse(x < 0, NA, x)
}

# clean the data
ess_clean <- ess_selected %>%
  # recode special values to NA
  mutate(across(c(freehms, lrscale, rlgdgr, ipeqopt, atchctr, eduyrs, agea), 
                recode_ess_missing)) %>%
  # create derived variables if needed
  mutate(
    # recode freehms to 0-1 scale (originally 1-5 where 1 = agree strongly)
    freehms_support = case_when(
      freehms %in% c(1, 2) ~ 1,  # agree and strongly agree
      freehms %in% c(3, 4, 5) ~ 0,  # neutral, disagree, strongly disagree
      TRUE ~ NA_real_),
    
    # create age groups
    age_group = case_when(
      agea < 35 ~ "18-34",
      agea < 55 ~ "35-54",
      TRUE ~ "55+"
    ),
    
    # standardise left-right scale to 0-1
    lrscale_std = (lrscale - 1) / 9,  # Original scale is 1-10
    
    # create high education indicator (above country median)
    high_educ = NA  # will fill this in after calculating country medians
  )

# calculate country median education for relative education measure
country_medians <- ess_clean %>%
  group_by(cntry) %>%
  summarize(median_educ = median(eduyrs, na.rm = TRUE))

# join back to main data and create high education indicator
ess_clean <- ess_clean %>%
  left_join(country_medians, by = "cntry") %>%
  mutate(high_educ = ifelse(eduyrs > median_educ, 1, 0))

# calculate weighted means by country
country_aggregates <- ess_clean %>%
  # group by country
  group_by(cntry) %>%
  # calculate weighted statistics
  summarize(
    # sample size
    n_respondents = n(),
    n_valid = sum(!is.na(freehms)),
    
    # weighted means
    pct_lgbt_support = weighted.mean(freehms_support, w = pspwght, na.rm = TRUE) * 100,
    mean_religiosity = weighted.mean(rlgdgr, w = pspwght, na.rm = TRUE),
    mean_left_right = weighted.mean(lrscale_std, w = pspwght, na.rm = TRUE),
    mean_equal_values = weighted.mean(ipeqopt, w = pspwght, na.rm = TRUE),
    mean_country_attach = weighted.mean(atchctr, w = pspwght, na.rm = TRUE),
    mean_eduyrs = weighted.mean(eduyrs, w = pspwght, na.rm = TRUE),
    mean_age = weighted.mean(agea, w = pspwght, na.rm = TRUE),
    
    # weighted proportions for categorical variables
    pct_young = weighted.mean(age_group == "18-34", w = pspwght, na.rm = TRUE) * 100,
    pct_high_educ = weighted.mean(high_educ, w = pspwght, na.rm = TRUE) * 100,
    
    # standard errors (for confidence intervals)
    se_lgbt_support = sd(freehms_support, na.rm = TRUE) / sqrt(sum(!is.na(freehms_support))),
    
    # missing data proportions
    pct_missing_lgbt = mean(is.na(freehms)) * 100)

# calculate cross-variable country indicators
country_indicators <- ess_clean %>%
  group_by(cntry) %>%
  summarize(
    # correlation between age and LGBT support within country
    age_lgbt_corr = cor(agea, freehms_support, use = "pairwise.complete.obs", method = "spearman"),
    
    # correlation between religiosity and LGBT support
    relig_lgbt_corr = cor(rlgdgr, freehms_support, use = "pairwise.complete.obs", method = "spearman"),
    
    # inequality in LGBT support (standard deviation)
    lgbt_support_inequality = sd(freehms_support, na.rm = TRUE),
    
    # education gradient in LGBT support (difference between high and low education)
    educ_gradient = weighted.mean(freehms_support[high_educ == 1], w = pspwght[high_educ == 1], na.rm = TRUE) - 
      weighted.mean(freehms_support[high_educ == 0], w = pspwght[high_educ == 0], na.rm = TRUE))

# join the aggregates and indicators
country_data_final <- country_aggregates %>%
  left_join(country_indicators, by = "cntry") %>%
  # create ISO country codes for easier merging with other datasets
  mutate(
    iso2c = countrycode(cntry, "iso2c", "iso2c"),
    iso3c = countrycode(cntry, "iso2c", "iso3c"))

# plot LGBT support by country
ggplot(country_data_final, aes(x = reorder(cntry, pct_lgbt_support), y = pct_lgbt_support)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_errorbar(aes(ymin = pct_lgbt_support - 1.96*se_lgbt_support, 
                    ymax = pct_lgbt_support + 1.96*se_lgbt_support), 
                width = 0.2) +
  labs(title = "Support for LGBT Rights by Country",
       subtitle = "Percent agreeing gays and lesbians should be free to live as they wish",
       x = "Country",
       y = "Support (%)") +
  theme_minimal() +
  coord_flip()

# examine relationship between religiosity and LGBT support
ggplot(country_data_final, aes(x = mean_religiosity, y = pct_lgbt_support, label = cntry)) +
  geom_point(size = 3, alpha = 0.7) +
  geom_text(hjust = -0.3, vjust = 0.3) +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(title = "Religiosity vs. LGBT Support by Country",
       x = "Mean Religiosity Score",
       y = "LGBT Support (%)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

saveRDS(country_data_final, file = "country_data_final.rds")
write.csv(country_data_final, "country_data_final.csv")



# 7. Unemployment rate ----------------------------------------------------
# https://en.wikipedia.org/wiki/List_of_European_Union_member_states_by_unemployment_rate

# scrape data from Wikipedia
url <- "https://en.wikipedia.org/wiki/List_of_European_Union_member_states_by_unemployment_rate"
page <- read_html(url)
tables <- html_table(page, fill = TRUE)
eu_unemployment_table <- tables[[1]]

# clean column names - simplify them to more standard names
colnames(eu_unemployment_table) <- c("Country", "Unemployment", "Employment", "Year")

# clean up the country names by removing footnote references
eu_unemployment_table$Country <- gsub("\\[.*?\\]", "", eu_unemployment_table$Country)

# make sure numeric columns are properly formatted
eu_unemployment_table$Unemployment <- as.character(eu_unemployment_table$Unemployment)
eu_unemployment_table$Employment <- as.numeric(as.character(eu_unemployment_table$Employment))
eu_unemployment_table$Year <- as.numeric(as.character(eu_unemployment_table$Year))

eu_unemployment_table <- eu_unemployment_table %>%
  mutate(
    # trim spaces from country names
    Country = trimws(Country),
    # convert Unemployment to numeric (remove any % signs or spaces if present)
    Unemployment = as.numeric(gsub("[^0-9.]", "", Unemployment)))

# modify Greece's unemployment rate to the 2018 value
eu_unemployment_table$Unemployment[eu_unemployment_table$Country == "Greece"] <- 19.18
eu_unemployment_table$Year[eu_unemployment_table$Country == "Greece"] <- 2018

# add the UK data manually
uk_data <- data.frame(
  Country = "United Kingdom",
  Unemployment = 4.12, # from Statista
  Employment = 75.25,  # estimated from https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/employmentandemployeetypes/articles/singlemonthlabourforcesurveyestimates/december2018
  Year = 2018)

# append UK data to the table
eu_unemployment_table <- rbind(eu_unemployment_table, uk_data)

# ignore the year column
eu_unemployment_table <- eu_unemployment_table %>%
  select(-Year)

# save
saveRDS(eu_unemployment_table, file = "eu_unemployment_table.rds")
write.csv(eu_unemployment_table, "eu_unemployment_table.csv", row.names = FALSE)


# 8. Gender equality ------------------------------------------------------
# https://eige.europa.eu/gender-statistics/dgs/indicator/index__index_scores/datatable?time=2017&col=domain&row=geo
gender_eq_index <- read_xlsx("data/raw/index__index_scores.xlsx", range = "A16:V44")

gender_eq_index <- gender_eq_index %>%
  select(country_name = "Geographic region\\(Sub-) Domain Scores", 
         gender_equality_index = "Overall Gender Equality Index") %>%
  mutate(country_name = ifelse(country_name == "Czechia", "Czech Republic", country_name))

# Merge the data ----------------------------------------------------------
# create a base dataframe with country identifying variables
country_level_df <- survey_country_mapping

# merge V-Dem data
country_level_df <- country_level_df %>%
  left_join(vdem_2019, by = c("country_name" = "country_name"))

# fix country name in democracy_scores for Czech Republic if needed
if(any(democracy_scores$Country == "Czechia")) {
  democracy_scores$Country[democracy_scores$Country == "Czechia"] <- "Czech Republic"
}

# merge democracy scores
country_level_df <- country_level_df %>%
  left_join(democracy_scores, by = c("iso2" = "ISO2"))

# merge GDP data
country_level_df <- country_level_df %>%
  left_join(df_GDP, by = "iso2")

# rename some countries to match our country_name format
rainbow_country_mapping <- data.frame(
  original = c("Czechia", "Andorra", "Bosnia & Herzegovina", "North Macedonia", "United Kingdom"),
  standardized = c("Czech Republic", "Andorra", "Bosnia and Herzegovina", "Macedonia", "United Kingdom"),
  stringsAsFactors = FALSE)

# apply standardized country names
for(i in 1:nrow(rainbow_country_mapping)) {
  rainbow_df$Country[rainbow_df$Country == rainbow_country_mapping$original[i]] <- 
    rainbow_country_mapping$standardized[i]
}

# extract and rename rainbow map variables for clarity
rainbow_data_clean <- rainbow_df %>%
  select(
    Country,
    rainbow_score_2019 = Value_2019,
    rainbow_score_2018 = Value_2018,
    rainbow_score_avg_2019_2018 = Avg_2019_2018,
    rainbow_score_avg_2013_2014 = Avg_2013_2014,
    rainbow_score_difference = Difference)

# merge Rainbow Map data
country_level_df <- country_level_df %>%
  left_join(rainbow_data_clean, by = c("country_name" = "Country"))

# merge happiness data
country_level_df <- country_level_df %>%
  left_join(happiness_scores, by = c("iso2" = "ISO2"))

# merge unemployment data
country_level_df <- country_level_df %>%
  left_join(eu_unemployment_table, by = c("country_name" = "Country"))

# create a mapping between ESS country codes and ISO2
ess_country_mapping <- data.frame(
  cntry = c("AT", "BE", "BG", "HR", "CY", "CZ", "DK", "EE", "FI", "FR", 
            "DE", "HU", "IE", "IT", "LV", "LT", "NL", "PL", "PT", "RO", 
            "SK", "SI", "ES", "SE", "GB"),
  iso2 = c("AT", "BE", "BG", "HR", "CY", "CZ", "DK", "EE", "FI", "FR", 
           "DE", "HU", "IE", "IT", "LV", "LT", "NL", "PL", "PT", "RO", 
           "SK", "SI", "ES", "SE", "GB"),
  stringsAsFactors = FALSE)

# first ensure cntry codes match our iso2 codes
country_data_final <- country_data_final %>%
  left_join(ess_country_mapping, by = "cntry") %>%
  select(-iso2c, -iso3c) # remove original ISO codes to avoid confusion

# rename variables for clarity
ess_data_clean <- country_data_final %>%
  select(
    cntry,
    n_respondents,
    n_valid,
    lgbt_support_percent = pct_lgbt_support,
    mean_religiosity,
    mean_left_right,
    mean_equal_values,
    mean_country_attach,
    mean_eduyrs,
    mean_age,
    pct_young,
    pct_high_educ,
    se_lgbt_support,
    pct_missing_lgbt,
    age_lgbt_corr,
    relig_lgbt_corr,
    lgbt_support_inequality,
    educ_gradient,
    #iso3c
  )

# merge ESS data
country_level_df <- country_level_df %>%
  left_join(ess_data_clean, by = c("iso2" = "cntry"))

country_level_df %>% View()

# delete the first Greece row
greece_rows <- which(country_level_df$country_name == "Greece")

if (length(greece_rows) > 1) {
  # remove the first instance of Greece
  country_level_df <- country_level_df[-greece_rows[1], ]
  
  # verify the fix worked
  greece_check <- country_level_df %>%
    filter(country_name == "Greece")
  print("Greece entries after removing the first instance:")
  print(greece_check)
}

# add the gender equality index
country_level_df <- country_level_df %>%
  left_join(gender_eq_index, by = "country_name")

## scaling: necessary for the regression analysis later on because the variables have vastly different scales
country_level_df %>% 
  select(where(is.numeric)) %>% 
  summary()
##    v2x_libdem     v2x_polyarchy      v2x_gender      v2x_egaldem    
##  Min.   :0.3680   Min.   :0.4720   Min.   :0.8410   Min.   :0.3340  
##  1st Qu.:0.7143   1st Qu.:0.8105   1st Qu.:0.8908   1st Qu.:0.6977  
##  Median :0.7855   Median :0.8595   Median :0.9220   Median :0.7550  
##  Mean   :0.7386   Mean   :0.8236   Mean   :0.9147   Mean   :0.7195  
##  3rd Qu.:0.8220   3rd Qu.:0.8808   3rd Qu.:0.9445   3rd Qu.:0.7917  
##  Max.   :0.8840   Max.   :0.9140   Max.   :0.9590   Max.   :0.8760  
##                                                                     
##   v2x_liberal       v2xcs_ccsi       v2x_freexp     Overall_score  
##  Min.   :0.7060   Min.   :0.5460   Min.   :0.6470   Min.   :6.380  
##  1st Qu.:0.8718   1st Qu.:0.8395   1st Qu.:0.9113   1st Qu.:7.357  
##  Median :0.9230   Median :0.9125   Median :0.9440   Median :7.790  
##  Mean   :0.8914   Mean   :0.8844   Mean   :0.9088   Mean   :7.886  
##  3rd Qu.:0.9380   3rd Qu.:0.9425   3rd Qu.:0.9702   3rd Qu.:8.568  
##  Max.   :0.9810   Max.   :0.9710   Max.   :0.9840   Max.   :9.390  
##                                                                    
##  Electoral_process_and_pluralism Functioning_of_government
##  Min.   : 8.750                  Min.   :5.360            
##  1st Qu.: 9.170                  1st Qu.:6.340            
##  Median : 9.580                  Median :7.320            
##  Mean   : 9.493                  Mean   :7.373            
##  3rd Qu.: 9.580                  3rd Qu.:8.300            
##  Max.   :10.000                  Max.   :9.640            
##                                                           
##  Political_participation Political_culture Civil_liberties     gdp_2005    
##  Min.   :5.000           Min.   : 4.380    Min.   : 7.060   Min.   : 9602  
##  1st Qu.:6.110           1st Qu.: 6.250    1st Qu.: 8.240   1st Qu.:16572  
##  Median :6.670           Median : 6.880    Median : 8.820   Median :26314  
##  Mean   :6.885           Mean   : 7.034    Mean   : 8.656   Mean   :26425  
##  3rd Qu.:8.330           3rd Qu.: 8.130    3rd Qu.: 9.120   3rd Qu.:32867  
##  Max.   :8.330           Max.   :10.000    Max.   :10.000   Max.   :68708  
##                                                                            
##     gdp_2018        gdp_growth     rainbow_score_2019 rainbow_score_2018
##  Min.   : 24052   Min.   : 19.15   Min.   :16.83      Min.   :16.07     
##  1st Qu.: 32335   1st Qu.: 53.44   1st Qu.:25.69      1st Qu.:28.92     
##  Median : 41545   Median : 68.22   Median :44.77      Median :51.40     
##  Mean   : 45761   Mean   : 83.19   Mean   :42.98      Mean   :49.39     
##  3rd Qu.: 52630   3rd Qu.:114.35   3rd Qu.:57.75      3rd Qu.:67.19     
##  Max.   :116334   Max.   :207.95   Max.   :74.72      Max.   :91.94     
##                                                                         
##  rainbow_score_avg_2019_2018 rainbow_score_avg_2013_2014
##  Min.   :16.45               Min.   :19.65              
##  1st Qu.:26.89               1st Qu.:29.31              
##  Median :48.34               Median :40.52              
##  Mean   :46.19               Mean   :43.86              
##  3rd Qu.:62.90               3rd Qu.:60.91              
##  Max.   :83.33               Max.   :79.38              
##                                                         
##  rainbow_score_difference Happiness_Score  Unemployment      Employment   
##  Min.   :-10.270          Min.   :4.933   Min.   : 2.800   Min.   :52.50  
##  1st Qu.: -6.414          1st Qu.:5.848   1st Qu.: 4.115   1st Qu.:64.97  
##  Median :  0.185          Median :6.173   Median : 5.650   Median :67.40  
##  Mean   :  2.325          Mean   :6.337   Mean   : 6.654   Mean   :67.17  
##  3rd Qu.:  5.274          3rd Qu.:6.938   3rd Qu.: 7.550   3rd Qu.:71.17  
##  Max.   : 37.280          Max.   :7.632   Max.   :19.180   Max.   :77.00  
##                           NA's   :1                                       
##  n_respondents     n_valid     lgbt_support_percent mean_religiosity
##  Min.   : 781   Min.   : 781   Min.   :30.85        Min.   :3.401   
##  1st Qu.:1529   1st Qu.:1529   1st Qu.:55.17        1st Qu.:4.490   
##  Median :1761   Median :1761   Median :75.34        Median :5.248   
##  Mean   :1769   Mean   :1769   Mean   :70.21        Mean   :5.343   
##  3rd Qu.:2200   3rd Qu.:2200   3rd Qu.:88.25        3rd Qu.:6.029   
##  Max.   :2745   Max.   :2745   Max.   :94.25        Max.   :7.513   
##  NA's   :4      NA's   :4      NA's   :4            NA's   :4       
##  mean_left_right  mean_equal_values mean_country_attach  mean_eduyrs   
##  Min.   :0.8134   Min.   :1.718     Min.   :6.974       Min.   :10.97  
##  1st Qu.:1.1858   1st Qu.:2.058     1st Qu.:7.977       1st Qu.:13.40  
##  Median :1.4188   Median :2.164     Median :8.116       Median :13.97  
##  Mean   :1.7429   Mean   :2.301     Mean   :8.177       Mean   :13.96  
##  3rd Qu.:2.1136   3rd Qu.:2.529     3rd Qu.:8.484       3rd Qu.:14.43  
##  Max.   :3.8986   Max.   :3.271     Max.   :9.341       Max.   :16.15  
##  NA's   :4        NA's   :4         NA's   :4           NA's   :4      
##     mean_age       pct_young     pct_high_educ   se_lgbt_support   
##  Min.   :45.46   Min.   :23.93   Min.   :26.32   Min.   :0.005644  
##  1st Qu.:48.48   1st Qu.:26.64   1st Qu.:36.40   1st Qu.:0.007032  
##  Median :49.44   Median :28.34   Median :41.12   Median :0.009516  
##  Mean   :52.31   Mean   :28.25   Mean   :39.98   Mean   :0.010061  
##  3rd Qu.:54.73   3rd Qu.:29.77   3rd Qu.:45.45   3rd Qu.:0.011830  
##  Max.   :68.17   Max.   :34.43   Max.   :52.72   Max.   :0.018152  
##  NA's   :4       NA's   :4       NA's   :4       NA's   :4         
##  pct_missing_lgbt age_lgbt_corr       relig_lgbt_corr   
##  Min.   :0        Min.   :-0.253892   Min.   :-0.27248  
##  1st Qu.:0        1st Qu.:-0.183525   1st Qu.:-0.19308  
##  Median :0        Median :-0.132761   Median :-0.14838  
##  Mean   :0        Mean   :-0.132479   Mean   :-0.14908  
##  3rd Qu.:0        3rd Qu.:-0.084937   3rd Qu.:-0.11943  
##  Max.   :0        Max.   : 0.009616   Max.   :-0.02086  
##  NA's   :4        NA's   :4           NA's   :4         
##  lgbt_support_inequality educ_gradient     gender_equality_index
##  Min.   :0.2294          Min.   :0.01615   Min.   :50.00        
##  1st Qu.:0.3139          1st Qu.:0.04848   1st Qu.:55.55        
##  Median :0.4196          Median :0.07695   Median :60.10        
##  Mean   :0.3966          Mean   :0.08256   Mean   :62.38        
##  3rd Qu.:0.4836          3rd Qu.:0.12288   3rd Qu.:69.25        
##  Max.   :0.5001          Max.   :0.16813   Max.   :82.60        
##  NA's   :4               NA's   :4         NA's   :1
# we'll apply different scaling methods based on the type of variable:
# 1. z-score standardization: for continuous variables to make them comparable (mean=0, sd=1)
# 2. min-max normalization: for variables with natural bounds (e.g., 0-100 scores)
# 3. log transformation: for heavily skewed variables like GDP

# custom function to standardize (z-score)
standardize <- function(x) {
  (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}

# custom function to min-max normalize
normalize <- function(x) {
  (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
}

# z-scores for the democracy scores
country_level_df <- country_level_df %>%
  mutate(
    z_v2x_libdem = standardize(v2x_libdem),
    z_v2x_polyarchy = standardize(v2x_polyarchy),
    z_v2x_gender = standardize(v2x_gender),
    z_v2x_egaldem = standardize(v2x_egaldem),
    z_v2x_liberal = standardize(v2x_liberal),
    z_v2xcs_ccsi = standardize(v2xcs_ccsi),
    z_v2x_freexp = standardize(v2x_freexp))

# z-scores for GDP and unemployment rate, natural log for GDP
country_level_df <- country_level_df %>%
  mutate(
    z_gdp_2018 = standardize(gdp_2018),
    log_gdp_2018 = log(gdp_2018),
    z_gdp_growth = standardize(gdp_growth),
    z_unemployment = standardize(Unemployment))

# z-scores and normalised rainbow variables 
country_level_df <- country_level_df %>%
  mutate(
    z_rainbow_score = standardize(rainbow_score_2019),
    norm_rainbow_score = normalize(rainbow_score_2019),
    z_lgbt_support = standardize(lgbt_support_percent),
    norm_lgbt_support = normalize(lgbt_support_percent))

# z-socres and normalised for happiness scores and gender equality
country_level_df <- country_level_df %>%
  mutate(
    z_happiness = standardize(Happiness_Score),
    z_gender_equality = standardize(gender_equality_index),
    norm_gender_equality = normalize(gender_equality_index))

# religion, politics, education and age
country_level_df <- country_level_df %>%
  mutate(
    z_religiosity = standardize(mean_religiosity),
    z_functioning_of_government = standardize(Functioning_of_government),
    z_left_right = standardize(mean_left_right),
    z_equal_values = standardize(mean_equal_values),
    z_country_attach = standardize(mean_country_attach),
    z_eduyrs = standardize(mean_eduyrs),
    z_age = standardize(mean_age))

# create two composite scores and regional classification
country_level_df <- country_level_df %>%
  mutate(
    composite_equality = rowMeans(
      cbind(z_gender_equality, z_rainbow_score, z_v2x_gender),
      na.rm = TRUE))

country_level_df <- country_level_df %>%
  mutate(
    composite_democracy = rowMeans(
      cbind(z_v2x_libdem, z_v2x_polyarchy, z_v2x_libdem, z_v2x_freexp), 
      na.rm = TRUE))

country_level_df <- country_level_df %>%
  mutate(
    region = case_when(
      country_name %in% c("Denmark", "Finland", "Sweden", "Estonia", "Latvia", "Lithuania") ~ 
        "Northern Europe",
      country_name %in% c("Belgium", "Netherlands", "Luxembourg", "Germany", "France", 
                          "Austria", "United Kingdom", "Ireland") ~ 
          "Western Europe",
       country_name %in% c("Portugal", "Spain", "Italy", "Malta", "Greece", "Cyprus") ~ 
        "Southern Europe",
      country_name %in% c("Poland", "Czech Republic", "Slovakia", "Hungary", "Slovenia", 
                          "Croatia", "Romania", "Bulgaria") ~           "Eastern Europe",
        TRUE ~ NA_character_))

# delete the "Country.x", "Country.y" and "CountryName" columns
country_level_df <- country_level_df %>%
  select(-c("Country.x", "Country.y", "CountryName"))

# save as RDS and csv
saveRDS(country_level_df, file = "country_level_df.rds")
write.csv(country_level_df, "country_level_df.csv")

Data cleaning

### data cleaning

library(tidyverse)
library(sjlabelled)
## 
## Attaching package: 'sjlabelled'
## The following object is masked from 'package:forcats':
## 
##     as_factor
## The following object is masked from 'package:dplyr':
## 
##     as_label
## The following object is masked from 'package:ggplot2':
## 
##     as_label
library(haven)
## 
## Attaching package: 'haven'
## The following objects are masked from 'package:sjlabelled':
## 
##     as_factor, read_sas, read_spss, read_stata, write_sas, zap_labels
library(mice)

data <- read_dta("data/raw/ZA7575.dta")

### 
# code the questions properly:
# 'DK' and refusals are especially a problem for questions with ordinal variables, 
# less so for non-ordinal variables; for reasons of completeness, we clean all of
# the 'DK' and refusals (and some other special cases) by converting them to NAs

data_correctly_coded <- data %>%
  mutate(
    # hard cases for me
    d72_1 = ifelse(d72_1 %in% c(5,6), NA, d72_1),
    d72_2 = ifelse(d72_2 %in% c(5,6), NA, d72_2),
    d60 = ifelse(d60 == 7, NA, d60),
    d25 = ifelse(d25 == 8, NA, d25),
    d8 = ifelse(d8 > 70, NA, d8), # subjective decision that no one can have more than 70 years of ed (even full-time professors)
    d7 = ifelse(d7 %in% c(15,97), NA, d7),
    d1 = ifelse(d1 %in% c(97,98), NA, d1),
    
    qa16_1 = ifelse(qa16_1 == 5, NA, qa16_1),
    qa16_2 = ifelse(qa16_2 == 5, NA, qa16_2),
    qa16_3 = ifelse(qa16_3 == 5, NA, qa16_3),
    qa16_4 = ifelse(qa16_4 == 5, NA, qa16_4),
    
    d71_1 = ifelse(d71_1 == 4, NA, d71_1),
    d71_2 = ifelse(d71_2 == 4, NA, d71_2),
    d71_3 = ifelse(d71_3 == 4, NA, d71_3),
    #qb4_1 = ifelse(qb4_1 == 5, NA, qb4_1),
    #qb4_2 = ifelse(qb4_2 == 5, NA, qb4_2),
    #qb4_3 = ifelse(qb4_3 == 5, NA, qb4_3),
    #qb4_4 = ifelse(qb4_4 == 5, NA, qb4_4),
    
    qb3_1 = ifelse(qb3_1 == 5, NA, qb3_1),
    qb3_2 = ifelse(qb3_2 == 5, NA, qb3_2),
    qb3_3 = ifelse(qb3_3 == 5, NA, qb3_3),
    qb3_4 = ifelse(qb3_4 == 5, NA, qb3_4),
    qb3_5 = ifelse(qb3_5 == 5, NA, qb3_5),
    qb3_6 = ifelse(qb3_6 == 5, NA, qb3_6),
    qb3_7 = ifelse(qb3_7 == 5, NA, qb3_7),
    
    qb4_1 = ifelse(qb4_1 == 5, NA, qb4_1),
    qb4_2 = ifelse(qb4_2 == 5, NA, qb4_2),
    qb4_3 = ifelse(qb4_3 == 5, NA, qb4_3),
    qb4_4 = ifelse(qb4_4 == 5, NA, qb4_4),
    qb4_5 = ifelse(qb4_5 == 5, NA, qb4_5),
    
    qb5_1 = ifelse(qb5_1 == 5, NA, qb5_1),
    qb5_2 = ifelse(qb5_2 == 5, NA, qb5_2),
    qb5_3 = ifelse(qb5_3 == 5, NA, qb5_3),
    qb5_4 = ifelse(qb5_4 == 5, NA, qb5_4),
    
    sd1_1 = ifelse(sd1_1 %in% c(3,4), NA, sd1_1),
    sd1_2 = ifelse(sd1_2 %in% c(3,4), NA, sd1_2),
    sd1_3 = ifelse(sd1_3 %in% c(3,4), NA, sd1_3),
    sd1_4 = ifelse(sd1_4 %in% c(3,4), NA, sd1_4),
    sd1_5 = ifelse(sd1_5 %in% c(3,4), NA, sd1_5),
    sd1_6 = ifelse(sd1_6 %in% c(3,4), NA, sd1_6),
    sd1_7 = ifelse(sd1_7 %in% c(3,4), NA, sd1_7),
    sd1_8 = ifelse(sd1_8 %in% c(3,4), NA, sd1_8),
    
    qc1_1 = ifelse(qc1_1 == 6, NA, qc1_1),
    qc1_2 = ifelse(qc1_2 == 6, NA, qc1_2),
    qc1_3 = ifelse(qc1_3 == 6, NA, qc1_3),
    qc1_4 = ifelse(qc1_4 == 6, NA, qc1_4),
    qc1_5 = ifelse(qc1_5 == 6, NA, qc1_5),
    qc1_6 = ifelse(qc1_6 == 6, NA, qc1_6),
    qc1_7 = ifelse(qc1_7 == 6, NA, qc1_7),
    qc1_8 = ifelse(qc1_8 == 6, NA, qc1_8),
    qc1_9 = ifelse(qc1_9 == 6, NA, qc1_9),
    qc1_10 = ifelse(qc1_10 == 6, NA, qc1_10),
    
    qc5_1 = ifelse(qc5_1 == 3, NA, qc5_1),
    qc5_2 = ifelse(qc5_2 == 3, NA, qc5_2),
    qc5_3 = ifelse(qc5_3 == 3, NA, qc5_3),
    qc5_4 = ifelse(qc5_4 == 3, NA, qc5_4),
    
    qc6_1 = ifelse(qc6_1 == 12, NA, qc6_1),
    qc6_2 = ifelse(qc6_2 == 12, NA, qc6_2),
    qc6_3 = ifelse(qc6_3 == 12, NA, qc6_3),
    qc6_4 = ifelse(qc6_4 == 12, NA, qc6_4),
    qc6_5 = ifelse(qc6_5 == 12, NA, qc6_5),
    qc6_6 = ifelse(qc6_6 == 12, NA, qc6_6),
    qc6_7 = ifelse(qc6_7 == 12, NA, qc6_7),
    qc6_8 = ifelse(qc6_8 == 12, NA, qc6_8),
    qc6_9 = ifelse(qc6_9 == 12, NA, qc6_9),
    qc6_10 = ifelse(qc6_10 == 12, NA, qc6_10),
    qc6_11 = ifelse(qc6_11 == 12, NA, qc6_11),
    
    qc9_1 = ifelse(qc9_1 == 7, NA, qc9_1),
    qc9_2 = ifelse(qc9_2 == 7, NA, qc9_2),
    qc9_3 = ifelse(qc9_3 == 7, NA, qc9_3),
    qc9_4 = ifelse(qc9_4 == 7, NA, qc9_4),
    qc9_5 = ifelse(qc9_5 == 7, NA, qc9_5),
    qc9_6 = ifelse(qc9_6 == 7, NA, qc9_6),
    qc9_7 = ifelse(qc9_7 == 7, NA, qc9_7),
    qc9_8 = ifelse(qc9_8 == 7, NA, qc9_8),
    qc9_9 = ifelse(qc9_9 == 7, NA, qc9_9),
    qc9_10 = ifelse(qc9_10 == 7, NA, qc9_10),
    qc9_11 = ifelse(qc9_11 == 7, NA, qc9_11),
    
    qc11_1 = ifelse(qc11_1 == 6, NA, qc11_1),
    qc11_2 = ifelse(qc11_2 == 6, NA, qc11_2),
    qc11_3 = ifelse(qc11_3 == 6, NA, qc11_3),
    qc11_4 = ifelse(qc11_4 == 6, NA, qc11_4),
    qc11_5 = ifelse(qc11_5 == 6, NA, qc11_5),
    qc11_6 = ifelse(qc11_6 == 6, NA, qc11_6),
    
    qc12_1 = ifelse(qc12_1 %in% c(11,12,13), NA, qc12_1),
    qc12_2 = ifelse(qc12_2 %in% c(11,12,13), NA, qc12_2),
    qc12_3 = ifelse(qc12_3 %in% c(11,12,13), NA, qc12_3),
    qc12_4 = ifelse(qc12_4 %in% c(11,12,13), NA, qc12_4),
    qc12_5 = ifelse(qc12_5 %in% c(11,12,13), NA, qc12_5),
    qc12_6 = ifelse(qc12_6 %in% c(11,12,13), NA, qc12_6),
    qc12_7 = ifelse(qc12_7 %in% c(11,12,13), NA, qc12_7),
    qc12_8 = ifelse(qc12_8 %in% c(11,12,13), NA, qc12_8),
    qc12_9 = ifelse(qc12_9 %in% c(11,12,13), NA, qc12_9),
    qc12_10 = ifelse(qc12_10 %in% c(11,12,13), NA, qc12_10),
    qc12_11 = ifelse(qc12_11 %in% c(11,12,13), NA, qc12_11),
    qc12_12 = ifelse(qc12_12 %in% c(11,12,13), NA, qc12_12),
    qc12_13 = ifelse(qc12_13 %in% c(11,12,13), NA, qc12_13),
    qc12_14 = ifelse(qc12_14 %in% c(11,12,13), NA, qc12_14),
    qc12_15 = ifelse(qc12_14 %in% c(11,12,13), NA, qc12_15),
    
    qc13_1 = ifelse(qc13_1 %in% c(11,12,13), NA, qc13_1),
    qc13_2 = ifelse(qc13_2 %in% c(11,12,13), NA, qc13_2),
    qc13_3 = ifelse(qc13_3 %in% c(11,12,13), NA, qc13_3),
    qc13_4 = ifelse(qc13_4 %in% c(11,12,13), NA, qc13_4),
    qc13_5 = ifelse(qc13_5 %in% c(11,12,13), NA, qc13_5),
    qc13_6 = ifelse(qc13_6 %in% c(11,12,13), NA, qc13_6),
    qc13_7 = ifelse(qc13_7 %in% c(11,12,13), NA, qc13_7),
    qc13_8 = ifelse(qc13_8 %in% c(11,12,13), NA, qc13_8),
    qc13_9 = ifelse(qc13_9 %in% c(11,12,13), NA, qc13_9),
    qc13_10 = ifelse(qc13_10 %in% c(11,12,13), NA, qc13_10),
    qc13_11 = ifelse(qc13_11 %in% c(11,12,13), NA, qc13_11),
    qc13_12 = ifelse(qc13_12 %in% c(11,12,13), NA, qc13_12),
    qc13_13 = ifelse(qc13_13 %in% c(11,12,13), NA, qc13_13),
    qc13_14 = ifelse(qc13_14 %in% c(11,12,13), NA, qc13_14),
    qc13_15 = ifelse(qc13_14 %in% c(11,12,13), NA, qc13_15),
    
    qc15_1 = ifelse(qc15_1 == 5, NA, qc15_1),
    qc15_2 = ifelse(qc15_2 == 5, NA, qc15_2),
    qc15_3 = ifelse(qc15_3 == 5, NA, qc15_3),
    
    qc16_1 = ifelse(qc16_1 == 5, NA, qc16_1),
    
    qc17_1 = ifelse(qc17_1 == 5, NA, qc17_1),
    qc17_2 = ifelse(qc17_2 == 5, NA, qc17_2),
    qc17_3 = ifelse(qc17_3 == 5, NA, qc17_3),
    qc17_4 = ifelse(qc17_4 == 5, NA, qc17_4),
    qc17_5 = ifelse(qc17_5 == 5, NA, qc17_5),
    qc17_6 = ifelse(qc17_6 == 5, NA, qc17_6),
    qc17_7 = ifelse(qc17_7 == 5, NA, qc17_7),
    
    qc18_1 = ifelse(qc18_1 %in% c(11,12), NA, qc18_1),
    qc18_2 = ifelse(qc18_2 %in% c(11,12), NA, qc18_2),
    qc18_3 = ifelse(qc18_3 %in% c(11,12), NA, qc18_3),
    
    sd3 = ifelse(sd3 %in% c(15,16), NA, sd3),
    
    # easy cases for Claude
    #q1 = ifelse(q1 %in% c(29,30), NA, q1),
    
    qa1 = ifelse(qa1 == 5, NA, qa1),
    
    qa7 = ifelse(qa7 == 5, NA, qa7),
    
    qa8 = ifelse(qa8 == 4, NA, qa8),
    
    qa9 = ifelse(qa9 == 6, NA, qa9),
    
    qa14 = ifelse(qa14 == 6, NA, qa14),
    
    qa17 = ifelse(qa17 == 5, NA, qa17),
    
    qb6 = ifelse(qb6 == 4, NA, qb6),
    
    qb7 = ifelse(qb7 == 5, NA, qb7),
    
    qc19 = ifelse(qc19 == 3, NA, qc19),
    
    qc20 = ifelse(qc20 == 3, NA, qc20),
    
    d63 = ifelse(d63 %in% c(6,7,8,9), NA, d63), # manually because stupid Claude
    
    d77 = ifelse(d77 == 5, NA, d77),
    
    d70 = ifelse(d70 == 5, NA, d70),
    
    qa4a = ifelse(qa4a %in% c(7,8), NA, qa4a),
    
    qa5a = ifelse(qa5a %in% c(11,12,13), NA, qa5a),
    
    qa11 = ifelse(qa11 %in% c(4,5), NA, qa11),
    
    qa12 = ifelse(qa12 %in% c(5,6), NA, qa12),
    
    qa13 = ifelse(qa13 %in% c(6,7), NA, qa13),
    
    qa18a = ifelse(qa18a %in% c(7,8,9), NA, qa18a),
    
    qb8 = ifelse(qb8 == 5, NA, qb8),
    
    sd3 = ifelse(sd3 == 16, NA, sd3),
    
    qc3 = ifelse(qc3 %in% c(10,11), NA, qc3),
    
    qc7 = ifelse(qc7 %in% c(11,12), NA, qc7),
    
    qc8 = ifelse(qc8 %in% c(11,12), NA, qc8),
    
    qc10 = ifelse(qc10 %in% c(9,10), NA, qc10))

# write_rds(data_correctly_coded, file = "data_correctly_coded.rds")


###
# get rid of all the "r" coded ones
# for the questions with underscores, (1) if they are on a scale, e.g., 1-3 and 'DK'
# is one of them, replace 'DK'; if it's 0-1 coding, keep that (can't do anything about it)

data_correctly_coded <- data_correctly_coded %>%
  select(-matches("_r$|_r[0-9]$|_r[0-9]_")) 
# use setdiff() to check whether it's done properly 


### 
# now analyse the patterns of NAs across columns
missing_data_before <- data_correctly_coded %>%
  summarise(across(everything(), ~sum(is.na(.))/n())) %>%
  pivot_longer(cols = everything(), 
               names_to = "variable", 
               values_to = "missing_proportion") %>%
  arrange(desc(missing_proportion))

# view top variables with missing data
print(head(missing_data_before, 20))
## # A tibble: 20 × 2
##    variable missing_proportion
##    <chr>                 <dbl>
##  1 p6mt                  0.982
##  2 p13mt                 0.982
##  3 p6cy                  0.982
##  4 p7cy                  0.982
##  5 p6lu                  0.981
##  6 p7lu                  0.981
##  7 p13lu                 0.981
##  8 p6hr                  0.964
##  9 p7hr                  0.964
## 10 p6ee                  0.963
## 11 p6fi                  0.963
## 12 p6lt                  0.963
## 13 p7ee                  0.963
## 14 p7fi                  0.963
## 15 p7lt                  0.963
## 16 p13ee                 0.963
## 17 p13fi                 0.963
## 18 p6dk                  0.963
## 19 p7dk                  0.963
## 20 p6es                  0.963
# viz
ggplot(missing_data_before %>% filter(missing_proportion > 0.25), 
       aes(x = reorder(variable, missing_proportion), y = missing_proportion)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variables with >25% Missing Data",
       x = "Variable",
       y = "Proportion Missing") +
  theme_minimal()

# calculate overall proportion of missing data
mean(missing_data_before$missing_proportion)
## [1] 0.1424237
# deeper analysis
missing_by_prefix <- missing_data_before %>%
  mutate(prefix = str_extract(variable, "^[a-z]+\\d+")) %>%
  group_by(prefix) %>%
  summarise(
    avg_missing = mean(missing_proportion),
    n_vars = n(),
    max_missing = max(missing_proportion),
    min_missing = min(missing_proportion)
  ) %>%
  arrange(desc(avg_missing))

print(head(missing_by_prefix, 15))
## # A tibble: 15 × 5
##    prefix avg_missing n_vars max_missing min_missing
##    <chr>        <dbl>  <int>       <dbl>       <dbl>
##  1 p13          0.945      8       0.982      0.779 
##  2 p6           0.931     29       0.982      0     
##  3 p7           0.930     28       0.982      0.0180
##  4 qc3          0.861      1       0.861      0.861 
##  5 qa3          0.679      7       0.679      0.679 
##  6 qa15         0.627      5       0.627      0.627 
##  7 qa2          0.376      8       0.376      0.376 
##  8 d15          0.261      2       0.522      0     
##  9 qb7          0.208      1       0.208      0.208 
## 10 d40          0.157      5       0.784      0     
## 11 qa13         0.124      1       0.124      0.124 
## 12 qc19         0.120      1       0.120      0.120 
## 13 qc16         0.117      1       0.117      0.117 
## 14 qc20         0.116      1       0.116      0.116 
## 15 qc10         0.109      1       0.109      0.109
# we take out the following variables because (1) they exhibited high missingness
# while at the same aren't super releveant (that's an assumption we make) for our
# subsequent analysis
data_correctly_coded <- data_correctly_coded %>%
  select(
    # Exclude variables with specific prefixes
    -starts_with("p6"),   # Size of locality
    -starts_with("p7"),   # Region
    -starts_with("p13"),  # Language of interview
    -starts_with("d40"),  # Household size
    -starts_with("qa3"),  # Not benefitting from trade
    -starts_with("qa15"), # Countries bought goods from
    -starts_with("qa2"),  # Benefitting from trade
    -starts_with("qb8"),  # EU energy label influence
    -starts_with("qa18b"), # Information sources follow-up
    -starts_with("qc3")   # Discrimination circumstances
  )

# check how much we imporved the missingness rate
missing_data_after <- data_correctly_coded %>%
  summarise(across(everything(), ~sum(is.na(.))/n())) %>%
  pivot_longer(cols = everything(), 
               names_to = "variable", 
               values_to = "missing_proportion") %>%
  arrange(desc(missing_proportion))

mean(missing_data_after$missing_proportion)
## [1] 0.02018962
# prepare for imputation
data_correctly_coded <-  data_correctly_coded %>%
  mutate(isocntry = as.factor(isocntry)) %>%
  # remove other problematic variables (like IDs) if needed
  select(!any_of(c("doi", "version", "caseid", "uniqid", "serialid"))) %>%
  sjlabelled::remove_all_labels()

# delete variables with zero variance as they cannot ever be useful predictors
var_zero <- data_correctly_coded %>% 
  select(where(is.numeric)) %>%
  summarise(across(everything(), var, na.rm = TRUE)) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "variance") %>%
  filter(variance == 0 | is.na(variance))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(everything(), var, na.rm = TRUE)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))
if(nrow(var_zero) > 0) {
  cat("Removing", nrow(var_zero), "variables with zero variance\n")
  data_correctly_coded <- data_correctly_coded %>%
    select(!any_of(var_zero$variable))
}
## Removing 6 variables with zero variance
data_correctly_coded <- data_correctly_coded %>%
  # convert country codes: collapse East and West Germany into one DE
  mutate(isocntry = case_when(
    isocntry %in% c("DE-W", "DE-E") ~ "DE",
    TRUE ~ isocntry))

###
# run the imputation
set.seed(1212)

start_time <- Sys.time()
#imp_model <- mice(data_correctly_coded, m = 3, method = 'rf', maxit = 3)
end_time <- Sys.time()
end_time - start_time
## Time difference of 0.0007820129 secs
#df_rf_new <- complete(imp_model, action = 3)

#write_rds(df_rf_new, "df_rf_new.rds")

Data cleaning: external datasets

## why still z-scores for mean religosity, left-right  etc. 

country_level_df <- readRDS("country_level_df.rds")

# first, let's check which variables have missing values
missing_summary <- colSums(is.na(country_level_df))
missing_summary <- missing_summary[missing_summary > 0]
print(missing_summary)
##         Happiness_Score           n_respondents                 n_valid 
##                       1                       4                       4 
##    lgbt_support_percent        mean_religiosity         mean_left_right 
##                       4                       4                       4 
##       mean_equal_values     mean_country_attach             mean_eduyrs 
##                       4                       4                       4 
##                mean_age               pct_young           pct_high_educ 
##                       4                       4                       4 
##         se_lgbt_support        pct_missing_lgbt           age_lgbt_corr 
##                       4                       4                       4 
##         relig_lgbt_corr lgbt_support_inequality           educ_gradient 
##                       4                       4                       4 
##   gender_equality_index          z_lgbt_support       norm_lgbt_support 
##                       1                       4                       4 
##             z_happiness       z_gender_equality    norm_gender_equality 
##                       1                       1                       1 
##           z_religiosity            z_left_right          z_equal_values 
##                       4                       4                       4 
##        z_country_attach                z_eduyrs                   z_age 
##                       4                       4                       4
# Visualize missing data patterns
vis_miss(country_level_df)

# we have the following imputation strategy:
# (1) we will have to delete the variables from the ESS because the Eurobarometer
# survey contains similar information and imputing these values didn't seem appropriate
# (2) we chose to impute the happiness score using KNN

# (1) 
# delete ESS variables
ESS_variables <- c(
  # original survey variables
  "n_respondents", "n_valid", "lgbt_support_percent", "mean_religiosity", 
  "mean_left_right", "mean_equal_values", "mean_country_attach", 
  "mean_eduyrs", "mean_age", "pct_young", "pct_high_educ", "se_lgbt_support",
  
  # additional ESS metrics
  "pct_missing_lgbt", "age_lgbt_corr", "relig_lgbt_corr", 
  "lgbt_support_inequality", "educ_gradient",
  
  # z-score transformations of ESS variables
  "z_religiosity", "z_left_right", "z_equal_values", 
  "z_country_attach", "z_eduyrs", "z_age")

country_level_df <- country_level_df %>%
  select(-all_of(ESS_variables))

# (2)
# create separate datasets for variables that need imputation
missing_variables <- names(missing_summary)[missing_summary < 5 & 
                                             !(names(missing_summary) %in% ESS_variables)]

# for our KNN imputation, we need to identify variables to use as predictors
# these should be variables without NAs that also correlate with those we want to impute

# identify complete variables that could serve as predictors
complete_variables <- names(country_level_df)[colSums(is.na(country_level_df)) == 0]
complete_variables <- complete_variables[!complete_variables %in% c("Unnamed: 0", "country_name", "iso2", "region")]

print(complete_variables)
##  [1] "v2x_libdem"                      "v2x_polyarchy"                  
##  [3] "v2x_gender"                      "v2x_egaldem"                    
##  [5] "v2x_liberal"                     "v2xcs_ccsi"                     
##  [7] "v2x_freexp"                      "Overall_score"                  
##  [9] "Electoral_process_and_pluralism" "Functioning_of_government"      
## [11] "Political_participation"         "Political_culture"              
## [13] "Civil_liberties"                 "Regime_type"                    
## [15] "gdp_2005"                        "gdp_2018"                       
## [17] "gdp_growth"                      "rainbow_score_2019"             
## [19] "rainbow_score_2018"              "rainbow_score_avg_2019_2018"    
## [21] "rainbow_score_avg_2013_2014"     "rainbow_score_difference"       
## [23] "Unemployment"                    "Employment"                     
## [25] "z_v2x_libdem"                    "z_v2x_polyarchy"                
## [27] "z_v2x_gender"                    "z_v2x_egaldem"                  
## [29] "z_v2x_liberal"                   "z_v2xcs_ccsi"                   
## [31] "z_v2x_freexp"                    "z_gdp_2018"                     
## [33] "log_gdp_2018"                    "z_gdp_growth"                   
## [35] "z_unemployment"                  "z_rainbow_score"                
## [37] "norm_rainbow_score"              "z_functioning_of_government"    
## [39] "composite_equality"              "composite_democracy"
# custom function to impute variables with KNN, handling non-numeric data
knn_impute <- function(data, vars_to_impute, predictor_vars, k = 5) {
  # create a copy of the data
  imputed_data <- data
  
  # ensure predictor variables are numeric and complete
  pred_data <- data[, predictor_vars, drop = FALSE]
  pred_data <- as.data.frame(lapply(pred_data, function(x) as.numeric(x)))
  
  # remove any non-numeric columns
  numeric_cols <- sapply(pred_data, is.numeric)
  if(any(!numeric_cols)) {
    warning("Removing non-numeric predictor columns: ", 
            paste(names(pred_data)[!numeric_cols], collapse=", "))
    pred_data <- pred_data[, numeric_cols, drop=FALSE]
    predictor_vars <- predictor_vars[numeric_cols]
  }
  
  # handle missing values in predictor variables by using column means
  for(col in names(pred_data)) {
    if(any(is.na(pred_data[[col]]))) {
      pred_data[[col]][is.na(pred_data[[col]])] <- mean(pred_data[[col]], na.rm=TRUE)
    }
  }
  
  # standardize the predictor variables manually to avoid issues
  pred_data_std <- as.data.frame(scale(pred_data))
  
  # for each variable to impute:
  for (var in vars_to_impute) {
    if (var %in% names(data) && sum(is.na(data[[var]])) > 0) {
      # ensure the target variable is numeric
      if (!is.numeric(data[[var]])) {
        cat("Skipping", var, "as it is not numeric\n")
        next
      }
      
      # identify cases with missing values
      missing_indices <- which(is.na(data[[var]]))
      
      # for each missing value:
      for (idx in missing_indices) {
        # calculate Euclidean distances to all other countries
        distances <- numeric(nrow(data))
        
        for (i in 1:nrow(data)) {
          if (i != idx) {
            # calculate squared differences for each predictor
            squared_diffs <- sapply(names(pred_data_std), function(p) {
              (pred_data_std[idx, p] - pred_data_std[i, p])^2
            })
            
            # sum and take square root for Euclidean distance
            distances[i] <- sqrt(sum(squared_diffs, na.rm=TRUE))
          } else {
            distances[i] <- Inf  # don't use the country itself
          }
        }
        
        # find k nearest neighbors with non-missing values for this variable
        valid_neighbors <- which(!is.na(data[[var]]) & !is.infinite(distances))
        
        if (length(valid_neighbors) > 0) {
          # order the neighbors by distance
          neighbor_order <- order(distances[valid_neighbors])
          valid_neighbors <- valid_neighbors[neighbor_order]
          
          # take the k nearest, or as many as available if fewer than k
          k_actual <- min(k, length(valid_neighbors))
          nearest_k <- valid_neighbors[1:k_actual]
          
          # impute as the average of the k nearest neighbors
          imputed_data[idx, var] <- mean(data[nearest_k, var], na.rm = TRUE)
          
          cat("Imputed", var, "for", data$country_name[idx], 
              "using neighbors:", paste(data$country_name[nearest_k], collapse=", "), "\n")
        } else {
          cat("Warning: Could not impute", var, "for", data$country_name[idx], 
              "- no valid neighbors found\n")
        }
      }
    }
  }
  
  return(imputed_data)
}

# now use the function to impute variables with missing values
# first ensure we have the right variable types
country_df_numeric <- country_level_df

for (var in missing_variables) {
  if (var %in% names(country_level_df) && !is.numeric(country_level_df[[var]])) {
    country_df_numeric[[var]] <- as.numeric(as.character(country_level_df[[var]]))
  }
}

# run the imputation with proper error handling
tryCatch({
  country_df_imputed <- knn_impute(
    data = country_df_numeric,
    vars_to_impute = missing_variables,
    predictor_vars = complete_variables,
    k = 3  # using 3 nearest neighbors
  )
  print("Imputation completed successfully!")
}, error = function(e) {
  # if the knn_impute function still fails, let's try an alternative approach
  print(paste("KNN imputation error:", e$message))
  print("Switching to a simpler mean imputation approach...")
  
  country_df_imputed <- country_df_numeric
  for (var in missing_variables) {
    if (sum(is.na(country_df_imputed[[var]])) > 0) {
      # Calculate mean of non-missing values
      var_mean <- mean(country_df_imputed[[var]], na.rm = TRUE)
      # Replace missing values with mean
      country_df_imputed[[var]][is.na(country_df_imputed[[var]])] <- var_mean
      print(paste("Imputed", var, "with mean value:", var_mean))
    }
  }
  return(country_df_imputed)
})
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Imputed Happiness_Score for Belgium using neighbors: France, Portugal, Denmark 
## Imputed gender_equality_index for United Kingdom using neighbors: Netherlands, Austria, France 
## Imputed z_lgbt_support for Luxembourg using neighbors: Ireland, Austria, Germany 
## Imputed z_lgbt_support for Greece using neighbors: Spain, Slovenia, Italy 
## Imputed z_lgbt_support for Malta using neighbors: United Kingdom, Croatia, Austria 
## Imputed z_lgbt_support for Romania using neighbors: Poland, Bulgaria, Croatia 
## Imputed norm_lgbt_support for Luxembourg using neighbors: Ireland, Austria, Germany 
## Imputed norm_lgbt_support for Greece using neighbors: Spain, Slovenia, Italy 
## Imputed norm_lgbt_support for Malta using neighbors: United Kingdom, Croatia, Austria 
## Imputed norm_lgbt_support for Romania using neighbors: Poland, Bulgaria, Croatia 
## Imputed z_happiness for Belgium using neighbors: France, Portugal, Denmark 
## Imputed z_gender_equality for United Kingdom using neighbors: Netherlands, Austria, France 
## Imputed norm_gender_equality for United Kingdom using neighbors: Netherlands, Austria, France 
## [1] "Imputation completed successfully!"
# save
saveRDS(country_df_imputed, file = "country_df_imputed.rds") 
write.csv(country_df_imputed, "country_df_imputed.csv", row.names = FALSE)

Country aggregates

# libraries
library(dplyr)
library(tidyr)
library(readr)

# load the data
df_rf_new <- read_csv("df_rf_new.csv") # the imputed dataset
## New names:
## Rows: 27438 Columns: 475
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): isocntry, nuts dbl (473): ...1, tnscntry, country, d11, d11r1, d11r2,
## d11r3, gen1, gen2, ge...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
country_df_imputed <- readRDS("country_df_imputed.rds") # external dataset

# custom function to calculate country-level aggregates
calculate_country_aggregates <- function(df) {
  country_agg <- df %>%
    group_by(isocntry) %>%
    summarize(
      
      iso2 = first(isocntry),
      
      # demographics
      mean_age = mean(d11, na.rm = TRUE),
      median_age = median(d11, na.rm = TRUE),
      
      # life satisfaction (recoded)
      mean_life_satisfaction = mean(5 - d70, na.rm = TRUE), 
      pct_satisfied = mean(d70 %in% c(1, 2), na.rm = TRUE) * 100,
      
      # political discussions
      mean_natl_political_discuss = mean(d71_1, na.rm = TRUE),
      mean_eu_political_discuss = mean(d71_2, na.rm = TRUE),
      mean_local_political_discuss = mean(d71_3, na.rm = TRUE),
      pct_discuss_natl_politics = mean(d71_1 %in% c(1, 2), na.rm = TRUE) * 100,
      pct_discuss_eu_politics = mean(d71_2 %in% c(1, 2), na.rm = TRUE) * 100,
      pct_discuss_local_politics = mean(d71_3 %in% c(1, 2), na.rm = TRUE) * 100,
      
      # trade and globalization (recoded)
      mean_trade_support = mean(5 - qa1, na.rm = TRUE),
      pct_trade_support = mean(qa1 %in% c(1, 2), na.rm = TRUE) * 100,
      pct_pro_globalization = mean(qa5a %in% c(1, 2, 3, 5, 7, 8), na.rm = TRUE) * 100,
      pct_anti_globalization = mean(qa5a %in% c(4, 6, 9, 10), na.rm = TRUE) * 100,
      mean_eu_trade_support = mean(5 - qa7, na.rm = TRUE),
      pct_support_eu_trade = mean(qa7 %in% c(1, 2), na.rm = TRUE) * 100,
      mean_esg_support = mean(5 - qa9, na.rm = TRUE),
      pct_support_esg = mean(qa9 %in% c(1, 2), na.rm = TRUE) * 100,
      
      # political and social indicators
      mean_left_right = mean(d1, na.rm = TRUE),
      pct_left = mean(d1 %in% c(1, 2, 3, 4), na.rm = TRUE) * 100,
      pct_center = mean(d1 %in% c(5, 6), na.rm = TRUE) * 100,
      pct_right = mean(d1 %in% c(7, 8, 9, 10), na.rm = TRUE) * 100,
      mean_education_years = mean(d8, na.rm = TRUE),
      median_education_years = median(d8, na.rm = TRUE),
      pct_high_education = mean(d8 >= 20, na.rm = TRUE) * 100,
      pct_rural = mean(d25 == 1, na.rm = TRUE) * 100,
      pct_urban = mean(d25 %in% c(2, 3), na.rm = TRUE) * 100,
      pct_large_urban = mean(d25 == 3, na.rm = TRUE) * 100,
      mean_financial_difficulty = mean(4 - d60, na.rm = TRUE),
      pct_financial_difficulty = mean(d60 %in% c(1, 2), na.rm = TRUE) * 100,
      mean_subjective_class = mean(d63, na.rm = TRUE),
      pct_working_class = mean(d63 == 1, na.rm = TRUE) * 100,
      pct_middle_class = mean(d63 %in% c(2, 3, 4), na.rm = TRUE) * 100,
      pct_upper_class = mean(d63 == 5, na.rm = TRUE) * 100,
      mean_voice_in_eu = mean(5 - d72_1, na.rm = TRUE),
      mean_voice_in_country = mean(5 - d72_2, na.rm = TRUE),
      pct_voice_in_eu = mean(d72_1 %in% c(1, 2), na.rm = TRUE) * 100,
      pct_voice_in_country = mean(d72_2 %in% c(1, 2), na.rm = TRUE) * 100,
      
      # diversity indicators
      pct_friends_diff_ethnic = mean(sd1_1 == 1, na.rm = TRUE) * 100,
      pct_friends_diff_skin = mean(sd1_2 == 1, na.rm = TRUE) * 100,
      pct_friends_roma = mean(sd1_3 == 1, na.rm = TRUE) * 100,
      pct_friends_lgbt = mean(sd1_4 == 1, na.rm = TRUE) * 100,
      pct_friends_disabled = mean(sd1_5 == 1, na.rm = TRUE) * 100,
      pct_friends_diff_religion = mean(sd1_6 == 1, na.rm = TRUE) * 100,
      pct_friends_transgender = mean(sd1_7 == 1, na.rm = TRUE) * 100,
      pct_friends_intersex = mean(sd1_8 == 1, na.rm = TRUE) * 100,
      pct_ethnic_minority = mean(sd2_1 == 1, na.rm = TRUE) * 100,
      pct_skin_minority = mean(sd2_2 == 1, na.rm = TRUE) * 100,
      pct_religious_minority = mean(sd2_3 == 1, na.rm = TRUE) * 100,
      pct_roma = mean(sd2_4 == 1, na.rm = TRUE) * 100,
      pct_sexual_minority = mean(sd2_5 == 1, na.rm = TRUE) * 100,
      pct_disability = mean(sd2_6 == 1, na.rm = TRUE) * 100,
      pct_other_minority = mean(sd2_7 == 1, na.rm = TRUE) * 100,
      pct_any_minority = mean(sd2_1 == 1 | sd2_2 == 1 | sd2_3 == 1 | sd2_4 == 1 | 
                                sd2_5 == 1 | sd2_6 == 1 | sd2_7 == 1, na.rm = TRUE) * 100,
      
      # religion
      pct_catholic = mean(sd3 == 1, na.rm = TRUE) * 100,
      pct_orthodox = mean(sd3 == 2, na.rm = TRUE) * 100,
      pct_protestant = mean(sd3 == 3, na.rm = TRUE) * 100,
      pct_other_christian = mean(sd3 == 4, na.rm = TRUE) * 100,
      pct_jewish = mean(sd3 == 5, na.rm = TRUE) * 100,
      pct_muslim = mean(sd3 %in% c(6, 7, 8), na.rm = TRUE) * 100,
      pct_atheist = mean(sd3 == 13, na.rm = TRUE) * 100,
      pct_nonbeliever = mean(sd3 == 14, na.rm = TRUE) * 100,
      pct_nonreligious = mean(sd3 %in% c(13, 14), na.rm = TRUE) * 100,
      
      # subjective discrimination perception (recoded)
      mean_discrim_ethnic = mean(5 - qc1_1, na.rm = TRUE),
      mean_discrim_skin = mean(5 - qc1_2, na.rm = TRUE),
      mean_discrim_roma = mean(5 - qc1_3, na.rm = TRUE),
      mean_discrim_lgbt = mean(5 - qc1_4, na.rm = TRUE),
      mean_discrim_age = mean(5 - qc1_5, na.rm = TRUE),
      mean_discrim_religion = mean(5 - qc1_6, na.rm = TRUE),
      mean_discrim_disability = mean(5 - qc1_7, na.rm = TRUE),
      mean_discrim_transgender = mean(5 - qc1_8, na.rm = TRUE),
      mean_discrim_gender = mean(5 - qc1_9, na.rm = TRUE),
      mean_discrim_intersex = mean(5 - qc1_10, na.rm = TRUE),
      
    )
  return(country_agg)
}

# apply the function
country_aggregates <- calculate_country_aggregates(df_rf_new)

# join 
country_data_combined <- country_df_imputed %>%
  left_join(country_aggregates, by = "iso2")

# while it's less relevant for a ML model such as random forest, for a multi-level 
# regression model, it's important to standardize the values to allow for better
# coefficient interpretation; the goal is to have both z-scores and 0-1 normalised
# values for each of the original variables (except for variables like 'region')

country_data_combined <- country_data_combined %>%
  # first handle all numeric variables that aren't already standardized
  mutate(
    # z-score standardization
    across(where(is.numeric) & 
             !starts_with("z_") & 
             !starts_with("norm_") &
             !matches("country|iso"), 
           list(z = ~scale(.)[,1]),
           .names = "z_{.col}"),
    
    # also create 0-1 normalized versions
    across(where(is.numeric) & 
             !starts_with("z_") & 
             !starts_with("norm_") &
             !matches("country|iso"), 
           list(norm = ~(.-min(., na.rm=TRUE))/(max(., na.rm=TRUE)-min(., na.rm=TRUE))),
           .names = "norm_{.col}"))

# save the combined dataset
saveRDS(country_data_combined, file = "country_data_combined.rds")
write_csv(country_data_combined, "country_data_combined.csv")

# join with df_rf; the actual, imputed survey data
df_rf_enriched_new <- df_rf_new %>%
  # ISO codes as key
  left_join(country_data_combined, by = c("isocntry" = "iso2"))

# save final dataset
write_csv(df_rf_enriched_new, "df_rf_enriched_new.csv")

Descriptive analysis 1

# load libraries
library(ggplot2)
library(dplyr)
library(tidyr)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(viridis)
## Loading required package: viridisLite
## 
## Attaching package: 'viridis'
## The following object is masked from 'package:scales':
## 
##     viridis_pal
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
country_level_df <- readRDS("country_level_df.rds")

# plot 1: employment and unemployment Rates
p_1 <- ggplot(country_level_df, aes(x = reorder(country_name, Employment))) +
  geom_bar(aes(y = Employment), stat = "identity", fill = "steelblue", alpha = 0.7) +
  geom_point(aes(y = Unemployment), color = "darkred", size = 3) +
  scale_y_continuous(
    name = "Employment Rate (%)",
    limits = c(0, 100),
    sec.axis = sec_axis(~., name = "Unemployment Rate (%)")
  ) +
  labs(title = "Employment and Unemployment Rates by Country (2018)",
       subtitle = "Bars: Employment rate | Points: Unemployment rate",
       x = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title.y.left = element_text(color = "steelblue"),
        axis.title.y.right = element_text(color = "darkred"))

# plot 2: GDP per capita and GDP per capita growth rate
p_2 <- ggplot(country_level_df, aes(x = reorder(country_name, gdp_2018))) +
  geom_bar(aes(y = gdp_2018), stat = "identity", fill = "steelblue", alpha = 0.7) +
  geom_point(aes(y = gdp_growth * 500), color = "darkred", size = 3) +  # Scale growth to fit on same axis
  scale_y_continuous(
    name = "GDP per Capita (USD)",
    labels = comma,
    sec.axis = sec_axis(~./500, name = "GDP Growth 2005-2018 (%)")
  ) +
  labs(title = "GDP per Capita (2018) and Growth Rate",
       subtitle = "Bars: GDP per capita | Points: Growth rate",
       x = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title.y.left = element_text(color = "steelblue"),
        axis.title.y.right = element_text(color = "darkred"))

# plot 3: relationship between employment and LGBT support
p_3 <- ggplot(country_level_df, aes(x = Employment, y = lgbt_support_percent)) +
  geom_point(aes(color = Unemployment, size = rainbow_score_2019)) +
  geom_text(aes(label = iso2), hjust = -0.3, vjust = 0, size = 3) +
  scale_color_viridis_c(option = "B", name = "Unemployment\nRate (%)") +
  scale_size_continuous(name = "Rainbow\nScore 2019") +
  labs(title = "Relationship between Employment Rates and LGBT Support",
       subtitle = "Color indicates unemployment rate, size shows level of LGBT legal protection",
       x = "Employment Rate (%)",
       y = "LGBT Support (%)") +
  theme_minimal() +
  theme(panel.grid.minor = element_blank())


# plot 4: relationship between GDP per capita to LGBT support
p_4 <- ggplot(country_level_df, aes(x = gdp_2018, y = lgbt_support_percent)) +
  geom_point(aes(color = rainbow_score_2019, size = gdp_growth)) +
  geom_text(aes(label = iso2), hjust = -0.3, vjust = 0, size = 3) +
  scale_color_viridis_c(option = "E", name = "Rainbow\nScore 2019") +
  scale_size_continuous(name = "GDP Growth\n2005-2018 (%)") +
  scale_x_continuous(labels = comma) +
  labs(title = "Relationship between GDP, LGBT Support and Rights Protections",
       subtitle = "Size indicates GDP growth rate from 2005-2018",
       x = "GDP per Capita (2018, USD)",
       y = "LGBT Support (%)") +
  theme_minimal() +
  theme(panel.grid.minor = element_blank())

# display the plots
p_1

p_2

p_3
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_text()`).

p_4
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 4 rows containing missing values or values outside the scale range
## (`geom_text()`).

Reduced dataset

library(dplyr)
library(readr)
library(kableExtra)

#df_rf_enriched_new <- readRDS("df_rf_enriched_new.rds")

df_reduced <- df_rf_enriched_new %>%
  select(
    # target
    qc19,
    
    # country identifier
    country, country_name, isocntry,
    
    # individual demographics
    age = d11,           
    gender = d10,            
    education = d8, d8r2,      
    occupation = d15a,           
    urban_rural = d25,            
    financial_insecurity = d60,            
    social_class = d63,         
    religion = sd3,            
    political_ideology = d1, 
    lgb_friends = sd1_4,
    trans_friends = sd1_7,
    
    # Personal identity
    ethnic_minority = sd2_1,
    skin_color_minority = sd2_2,
    religious_minority = sd2_3,
    sexual_lgbt_minority = sd2_5,
    disability_minority = sd2_6,
    other_minority = sd2_7,
    none_minority = sd2_8,
    
    # Discrimination
    trans_discrimination_country = qc1_8,
    trans_discrimination_personal = qc2_6,
    trans_discrimination_workplace = qc4_8,
    trans_discrimination_political = qc6_10,
    country_discrimination_efforts = qc7,
    country_discrimination_efforts_recoded = qc7r,
    trans_workplace_diversity = qc9_10,
    trans_colleague = qc12_11,
    trans_colleague_recoded = qc12_11r,
    trans_child_relationship = qc13_11,
    trans_child_relationship_recoded = qc13_11r,
    lgb_rights = qc15_1, 
    same_sex_relationship = qc15_2, 
    same_sex_marriage = qc15_3,
    lgb_school_materials = qc17_3,
    trans_school_materials = qc17_4,
    intersex_school_materials = qc17_5,
    two_men_public_affection = qc18_2,
    two_men_public_affection_recoded = qc18_2r,
    two_women_public_affection = qc18_3,
    two_women_public_affection_recoded = qc18_3r,
    non_gendered_docs = qc20,
    
    # Country-level variables
    gdp_2018,                # Economic development
    rainbow_score_2019,      # LGBTI rights/protections
    gender_equality_index,   # Gender equality
    Happiness_Score,         # National well-being
    gdp_2018,                # Economic development
    rainbow_score_2019,      # LGBTI rights/protections
    rainbow_score_2018,      # LGBTI rights/protections (previous year)
    rainbow_score_avg_2019_2018, # Average LGBTI rights/protections
    gender_equality_index,   # Gender equality
    Happiness_Score,         # National well-being
    v2x_libdem,              # Democratic quality
    v2x_egaldem,             # Democratic quality
    Regime_type,             # Democratic quality
    z_functioning_of_government, #Govenment function measure
    composite_equality,     # Equality measure
    z_composite_equality,   # Standardized equality measure
    norm_composite_equality, # Normalized equality measure
    z_lgbt_support,         # Standardized LGBT support
    norm_lgbt_support,      # Normalized LGBT support
    pct_friends_lgbt,       # Percentage of LGBT friends
    z_pct_friends_lgbt,     # Standardized % of LGBT friends
    norm_pct_friends_lgbt,  # Normalized % of LGBT friends
    mean_left_right,         # Average political leaning
    pct_high_education,      # Education level
    region,                  # Geographic region
    
    # Paradata
    interview_date = p1,                      
    interview_start_time = p2,                      
    interview_duration = p3,                      
    interview_duration_recoded = p3r,                     
    people_present_during_interview = p4,                      
    respondent_cooperation = p5                       
  )

# Save to a new file
write_rds(df_reduced, "df_reduced.rds")

Descriptive analysis 2

# packages
library(tidyverse)
library(corrplot)
## corrplot 0.95 loaded
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggrepel)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)
library(DALEX) 
## Welcome to DALEX (version: 2.4.3).
## Find examples and detailed introduction at: http://ema.drwhy.ai/
## 
## Attaching package: 'DALEX'
## The following object is masked from 'package:dplyr':
## 
##     explain
library(ranger) 
## 
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
## 
##     importance
# load datasets
df_reduced <- readRDS("df_reduced.rds")
country_data_combined <- readRDS("country_data_combined.rds")

# first calculate country aggregates for individual variables so that we have
# continuous data for which we can more easily calculate correlations

df_reduced_aggregated <- df_reduced %>%
  group_by(country_name, isocntry) %>%
  summarize(
    
    # target
    pct_support_trans = mean(qc19 == 1, na.rm = TRUE) * 100,
    pct_oppose_trans = mean(qc19 == 2, na.rm = TRUE) * 100,
    pct_dk_trans = mean(qc19 == 3, na.rm = TRUE) * 100,
    
    # demographics
    mean_age = mean(age, na.rm = TRUE),
    pct_female = mean(gender == 2, na.rm = TRUE) * 100,
    mean_education_years = mean(education, na.rm = TRUE),
    pct_high_educ = mean(d8r2 >= 3, na.rm = TRUE) * 100,
    pct_rural = mean(urban_rural == 1, na.rm = TRUE) * 100,
    pct_urban = mean(urban_rural == 2, na.rm = TRUE) * 100,
    pct_large_urban = mean(urban_rural == 3, na.rm = TRUE) * 100,
    pct_financial_difficulty = mean(financial_insecurity <= 2, na.rm = TRUE) * 100,
    pct_working_class = mean(social_class == 1, na.rm = TRUE) * 100,
    pct_middle_class = mean(social_class %in% c(2,3,4), na.rm = TRUE) * 100,
    pct_upper_class = mean(social_class == 5, na.rm = TRUE) * 100,
    
    # values and identity
    mean_political_ideology = mean(political_ideology, na.rm = TRUE),
    pct_left = mean(political_ideology <= 3, na.rm = TRUE) * 100,
    pct_center = mean(political_ideology > 3 & political_ideology < 8, na.rm = TRUE) * 100,
    pct_right = mean(political_ideology >= 8, na.rm = TRUE) * 100,
    
    # religious composition
    pct_religious = mean(religion %in% c(1:11), na.rm = TRUE) * 100,
    pct_catholic = mean(religion == 1, na.rm = TRUE) * 100,
    pct_orthodox = mean(religion == 2, na.rm = TRUE) * 100,
    pct_protestant = mean(religion == 3, na.rm = TRUE) * 100,
    pct_other_christian = mean(religion == 4, na.rm = TRUE) * 100,
    pct_muslim = mean(religion %in% c(6:8), na.rm = TRUE) * 100,
    pct_other_religion = mean(religion %in% c(9:11), na.rm = TRUE) * 100,
    pct_atheist = mean(religion == 13, na.rm = TRUE) * 100,
    pct_nonbeliever = mean(religion == 14, na.rm = TRUE) * 100,
    pct_nonreligious = mean(religion %in% c(13,14), na.rm = TRUE) * 100,
    
    # social networks and contact
    pct_lgb_friends = mean(lgb_friends == 1, na.rm = TRUE) * 100,
    pct_trans_friends = mean(trans_friends == 1, na.rm = TRUE) * 100,
    
    # personal identity/minority status
    pct_ethnic_minority = mean(ethnic_minority == 1, na.rm = TRUE) * 100,
    pct_skin_color_minority = mean(skin_color_minority == 1, na.rm = TRUE) * 100,
    pct_religious_minority = mean(religious_minority == 1, na.rm = TRUE) * 100,
    pct_sexual_minority = mean(sexual_lgbt_minority == 1, na.rm = TRUE) * 100,
    pct_disability_minority = mean(disability_minority == 1, na.rm = TRUE) * 100,
    pct_other_minority = mean(other_minority == 1, na.rm = TRUE) * 100,
    pct_no_minority = mean(none_minority == 1, na.rm = TRUE) * 100,
    
    # discrimination perceptions; we need to be careful with interpretation/ coding
    mean_trans_discrim_country = mean(trans_discrimination_country, na.rm = TRUE),
    pct_perceive_trans_discrim = mean(trans_discrimination_country %in% c(1,2), na.rm = TRUE) * 100,
    pct_experienced_trans_discrim = mean(trans_discrimination_personal == 1, na.rm = TRUE) * 100,
    pct_workplace_trans_discrim = mean(trans_discrimination_workplace == 1, na.rm = TRUE) * 100,
    
    # political representation comfort - original scale (1-10, higher = more comfortable)
    mean_trans_political_comfort = mean(trans_discrimination_political, na.rm = TRUE),
    pct_comfortable_trans_political = mean(trans_discrimination_political >= 7, na.rm = TRUE) * 100,
    
    # effectiveness of discrimination efforts (original scale: 1-10, higher = more effective)
    mean_country_discrimination_efforts = mean(country_discrimination_efforts, na.rm = TRUE),
    
    # workplace diversity (1=Yes definitely, 4=No definitely not)
    mean_trans_workplace_diversity = mean(trans_workplace_diversity, na.rm = TRUE),
    pct_support_trans_workplace_diversity = mean(trans_workplace_diversity %in% c(1,2), na.rm = TRUE) * 100,
    
    # LGBT attitudes - REVERSED for these scales where 1=Totally agree, 4=Totally disagree
    # Create reversed versions so higher = more supportive
    mean_lgb_rights_rev = mean(case_when(
      lgb_rights == 1 ~ 4,
      lgb_rights == 2 ~ 3,
      lgb_rights == 3 ~ 2,
      lgb_rights == 4 ~ 1,
      TRUE ~ NA_real_), na.rm = TRUE),
    
    mean_same_sex_relationship_rev = mean(case_when(
      same_sex_relationship == 1 ~ 4,
      same_sex_relationship == 2 ~ 3,
      same_sex_relationship == 3 ~ 2,
      same_sex_relationship == 4 ~ 1,
      TRUE ~ NA_real_), na.rm = TRUE),
    
    mean_same_sex_marriage_rev = mean(case_when(
      same_sex_marriage == 1 ~ 4,
      same_sex_marriage == 2 ~ 3,
      same_sex_marriage == 3 ~ 2,
      same_sex_marriage == 4 ~ 1,
      TRUE ~ NA_real_), na.rm = TRUE),
    
    # also calculate percentage who agree with LGBT rights (original values 1 or 2)
    pct_support_lgb_rights = mean(lgb_rights %in% c(1,2), na.rm = TRUE) * 100,
    pct_support_same_sex_relationship = mean(same_sex_relationship %in% c(1,2), na.rm = TRUE) * 100,
    pct_support_same_sex_marriage = mean(same_sex_marriage %in% c(1,2), na.rm = TRUE) * 100,
    
    # school materials (1=Totally agree, 4=Totally disagree)
    # Create reversed versions so higher = more supportive
    mean_lgb_school_materials_rev = mean(case_when(
      lgb_school_materials == 1 ~ 4,
      lgb_school_materials == 2 ~ 3,
      lgb_school_materials == 3 ~ 2,
      lgb_school_materials == 4 ~ 1,
      TRUE ~ NA_real_), na.rm = TRUE),
    
    mean_trans_school_materials_rev = mean(case_when(
      trans_school_materials == 1 ~ 4,
      trans_school_materials == 2 ~ 3,
      trans_school_materials == 3 ~ 2,
      trans_school_materials == 4 ~ 1,
      TRUE ~ NA_real_), na.rm = TRUE),
    
    mean_intersex_school_materials_rev = mean(case_when(
      intersex_school_materials == 1 ~ 4,
      intersex_school_materials == 2 ~ 3,
      intersex_school_materials == 3 ~ 2,
      intersex_school_materials == 4 ~ 1,
      TRUE ~ NA_real_), na.rm = TRUE),
    
    # also calculate percentage who agree with inclusive school materials
    pct_support_lgb_school_materials = mean(lgb_school_materials %in% c(1,2), na.rm = TRUE) * 100,
    pct_support_trans_school_materials = mean(trans_school_materials %in% c(1,2), na.rm = TRUE) * 100,
    pct_support_intersex_school_materials = mean(intersex_school_materials %in% c(1,2), na.rm = TRUE) * 100,
    
    # comfort with same-sex public displays of affection (1-10)
    mean_men_pda_comfort = mean(two_men_public_affection, na.rm = TRUE),
    mean_women_pda_comfort = mean(two_women_public_affection, na.rm = TRUE),
    pct_comfortable_men_pda = mean(two_men_public_affection >= 7, na.rm = TRUE) * 100,
    pct_comfortable_women_pda = mean(two_women_public_affection >= 7, na.rm = TRUE) * 100,
    
    # comfort with transgender colleagues/children (1-10)
    mean_trans_colleague_comfort = mean(trans_colleague, na.rm = TRUE),
    mean_trans_child_relationship = mean(trans_child_relationship, na.rm = TRUE),
    pct_comfortable_trans_colleague = mean(trans_colleague >= 7, na.rm = TRUE) * 100,
    pct_comfortable_trans_child = mean(trans_child_relationship >= 7, na.rm = TRUE) * 100,
    
    # support for non-gendered documents
    pct_support_nongendered_docs = mean(non_gendered_docs == 1, na.rm = TRUE) * 100,
    pct_oppose_nongendered_docs = mean(non_gendered_docs == 2, na.rm = TRUE) * 100,
    pct_dk_nongendered_docs = mean(non_gendered_docs == 3, na.rm = TRUE) * 100) %>%
  ungroup()
## `summarise()` has grouped output by 'country_name'. You can override using the
## `.groups` argument.
# add country-level variables from the original dataset
# extract country-level variables that don't need to be aggregated
country_level_variables <- df_reduced %>%
  select(country_name, gdp_2018, rainbow_score_2019, rainbow_score_2018, 
         gender_equality_index, Happiness_Score, 
         v2x_libdem, v2x_egaldem, Regime_type, composite_equality, 
         z_composite_equality, norm_composite_equality, z_lgbt_support, 
         norm_lgbt_support, pct_friends_lgbt, z_pct_friends_lgbt, 
         norm_pct_friends_lgbt, mean_left_right, pct_high_education, region) %>%
  distinct()

# merge with aggregated data
df_reduced_aggregated_complete <- df_reduced_aggregated %>%
  left_join(country_level_variables, by = "country_name")

# what we could add
setdiff(colnames(country_data_combined), country_level_variables)
##   [1] "country_name"                        
##   [2] "iso2"                                
##   [3] "v2x_libdem"                          
##   [4] "v2x_polyarchy"                       
##   [5] "v2x_gender"                          
##   [6] "v2x_egaldem"                         
##   [7] "v2x_liberal"                         
##   [8] "v2xcs_ccsi"                          
##   [9] "v2x_freexp"                          
##  [10] "Overall_score"                       
##  [11] "Electoral_process_and_pluralism"     
##  [12] "Functioning_of_government"           
##  [13] "Political_participation"             
##  [14] "Political_culture"                   
##  [15] "Civil_liberties"                     
##  [16] "Regime_type"                         
##  [17] "gdp_2005"                            
##  [18] "gdp_2018"                            
##  [19] "gdp_growth"                          
##  [20] "rainbow_score_2019"                  
##  [21] "rainbow_score_2018"                  
##  [22] "rainbow_score_avg_2019_2018"         
##  [23] "rainbow_score_avg_2013_2014"         
##  [24] "rainbow_score_difference"            
##  [25] "Happiness_Score"                     
##  [26] "Unemployment"                        
##  [27] "Employment"                          
##  [28] "gender_equality_index"               
##  [29] "z_v2x_libdem"                        
##  [30] "z_v2x_polyarchy"                     
##  [31] "z_v2x_gender"                        
##  [32] "z_v2x_egaldem"                       
##  [33] "z_v2x_liberal"                       
##  [34] "z_v2xcs_ccsi"                        
##  [35] "z_v2x_freexp"                        
##  [36] "z_gdp_2018"                          
##  [37] "log_gdp_2018"                        
##  [38] "z_gdp_growth"                        
##  [39] "z_unemployment"                      
##  [40] "z_rainbow_score"                     
##  [41] "norm_rainbow_score"                  
##  [42] "z_lgbt_support"                      
##  [43] "norm_lgbt_support"                   
##  [44] "z_happiness"                         
##  [45] "z_gender_equality"                   
##  [46] "norm_gender_equality"                
##  [47] "z_functioning_of_government"         
##  [48] "composite_equality"                  
##  [49] "composite_democracy"                 
##  [50] "region"                              
##  [51] "isocntry"                            
##  [52] "mean_age"                            
##  [53] "median_age"                          
##  [54] "mean_life_satisfaction"              
##  [55] "pct_satisfied"                       
##  [56] "mean_natl_political_discuss"         
##  [57] "mean_eu_political_discuss"           
##  [58] "mean_local_political_discuss"        
##  [59] "pct_discuss_natl_politics"           
##  [60] "pct_discuss_eu_politics"             
##  [61] "pct_discuss_local_politics"          
##  [62] "mean_trade_support"                  
##  [63] "pct_trade_support"                   
##  [64] "pct_pro_globalization"               
##  [65] "pct_anti_globalization"              
##  [66] "mean_eu_trade_support"               
##  [67] "pct_support_eu_trade"                
##  [68] "mean_esg_support"                    
##  [69] "pct_support_esg"                     
##  [70] "mean_left_right"                     
##  [71] "pct_left"                            
##  [72] "pct_center"                          
##  [73] "pct_right"                           
##  [74] "mean_education_years"                
##  [75] "median_education_years"              
##  [76] "pct_high_education"                  
##  [77] "pct_rural"                           
##  [78] "pct_urban"                           
##  [79] "pct_large_urban"                     
##  [80] "mean_financial_difficulty"           
##  [81] "pct_financial_difficulty"            
##  [82] "mean_subjective_class"               
##  [83] "pct_working_class"                   
##  [84] "pct_middle_class"                    
##  [85] "pct_upper_class"                     
##  [86] "mean_voice_in_eu"                    
##  [87] "mean_voice_in_country"               
##  [88] "pct_voice_in_eu"                     
##  [89] "pct_voice_in_country"                
##  [90] "pct_friends_diff_ethnic"             
##  [91] "pct_friends_diff_skin"               
##  [92] "pct_friends_roma"                    
##  [93] "pct_friends_lgbt"                    
##  [94] "pct_friends_disabled"                
##  [95] "pct_friends_diff_religion"           
##  [96] "pct_friends_transgender"             
##  [97] "pct_friends_intersex"                
##  [98] "pct_ethnic_minority"                 
##  [99] "pct_skin_minority"                   
## [100] "pct_religious_minority"              
## [101] "pct_roma"                            
## [102] "pct_sexual_minority"                 
## [103] "pct_disability"                      
## [104] "pct_other_minority"                  
## [105] "pct_any_minority"                    
## [106] "pct_catholic"                        
## [107] "pct_orthodox"                        
## [108] "pct_protestant"                      
## [109] "pct_other_christian"                 
## [110] "pct_jewish"                          
## [111] "pct_muslim"                          
## [112] "pct_atheist"                         
## [113] "pct_nonbeliever"                     
## [114] "pct_nonreligious"                    
## [115] "mean_discrim_ethnic"                 
## [116] "mean_discrim_skin"                   
## [117] "mean_discrim_roma"                   
## [118] "mean_discrim_lgbt"                   
## [119] "mean_discrim_age"                    
## [120] "mean_discrim_religion"               
## [121] "mean_discrim_disability"             
## [122] "mean_discrim_transgender"            
## [123] "mean_discrim_gender"                 
## [124] "mean_discrim_intersex"               
## [125] "z_Overall_score"                     
## [126] "z_Electoral_process_and_pluralism"   
## [127] "z_Functioning_of_government"         
## [128] "z_Political_participation"           
## [129] "z_Political_culture"                 
## [130] "z_Civil_liberties"                   
## [131] "z_gdp_2005"                          
## [132] "z_rainbow_score_2019"                
## [133] "z_rainbow_score_2018"                
## [134] "z_rainbow_score_avg_2019_2018"       
## [135] "z_rainbow_score_avg_2013_2014"       
## [136] "z_rainbow_score_difference"          
## [137] "z_Happiness_Score"                   
## [138] "z_Unemployment"                      
## [139] "z_Employment"                        
## [140] "z_gender_equality_index"             
## [141] "z_log_gdp_2018"                      
## [142] "z_composite_equality"                
## [143] "z_composite_democracy"               
## [144] "z_mean_age"                          
## [145] "z_median_age"                        
## [146] "z_mean_life_satisfaction"            
## [147] "z_pct_satisfied"                     
## [148] "z_mean_natl_political_discuss"       
## [149] "z_mean_eu_political_discuss"         
## [150] "z_mean_local_political_discuss"      
## [151] "z_pct_discuss_natl_politics"         
## [152] "z_pct_discuss_eu_politics"           
## [153] "z_pct_discuss_local_politics"        
## [154] "z_mean_trade_support"                
## [155] "z_pct_trade_support"                 
## [156] "z_pct_pro_globalization"             
## [157] "z_pct_anti_globalization"            
## [158] "z_mean_eu_trade_support"             
## [159] "z_pct_support_eu_trade"              
## [160] "z_mean_esg_support"                  
## [161] "z_pct_support_esg"                   
## [162] "z_mean_left_right"                   
## [163] "z_pct_left"                          
## [164] "z_pct_center"                        
## [165] "z_pct_right"                         
## [166] "z_mean_education_years"              
## [167] "z_median_education_years"            
## [168] "z_pct_high_education"                
## [169] "z_pct_rural"                         
## [170] "z_pct_urban"                         
## [171] "z_pct_large_urban"                   
## [172] "z_mean_financial_difficulty"         
## [173] "z_pct_financial_difficulty"          
## [174] "z_mean_subjective_class"             
## [175] "z_pct_working_class"                 
## [176] "z_pct_middle_class"                  
## [177] "z_pct_upper_class"                   
## [178] "z_mean_voice_in_eu"                  
## [179] "z_pct_voice_in_eu"                   
## [180] "z_pct_friends_diff_ethnic"           
## [181] "z_pct_friends_diff_skin"             
## [182] "z_pct_friends_roma"                  
## [183] "z_pct_friends_lgbt"                  
## [184] "z_pct_friends_disabled"              
## [185] "z_pct_friends_diff_religion"         
## [186] "z_pct_friends_transgender"           
## [187] "z_pct_friends_intersex"              
## [188] "z_pct_ethnic_minority"               
## [189] "z_pct_skin_minority"                 
## [190] "z_pct_religious_minority"            
## [191] "z_pct_roma"                          
## [192] "z_pct_sexual_minority"               
## [193] "z_pct_disability"                    
## [194] "z_pct_other_minority"                
## [195] "z_pct_any_minority"                  
## [196] "z_pct_catholic"                      
## [197] "z_pct_orthodox"                      
## [198] "z_pct_protestant"                    
## [199] "z_pct_other_christian"               
## [200] "z_pct_jewish"                        
## [201] "z_pct_muslim"                        
## [202] "z_pct_atheist"                       
## [203] "z_pct_nonbeliever"                   
## [204] "z_pct_nonreligious"                  
## [205] "z_mean_discrim_ethnic"               
## [206] "z_mean_discrim_skin"                 
## [207] "z_mean_discrim_roma"                 
## [208] "z_mean_discrim_lgbt"                 
## [209] "z_mean_discrim_age"                  
## [210] "z_mean_discrim_religion"             
## [211] "z_mean_discrim_disability"           
## [212] "z_mean_discrim_transgender"          
## [213] "z_mean_discrim_gender"               
## [214] "z_mean_discrim_intersex"             
## [215] "norm_v2x_libdem"                     
## [216] "norm_v2x_polyarchy"                  
## [217] "norm_v2x_gender"                     
## [218] "norm_v2x_egaldem"                    
## [219] "norm_v2x_liberal"                    
## [220] "norm_v2xcs_ccsi"                     
## [221] "norm_v2x_freexp"                     
## [222] "norm_Overall_score"                  
## [223] "norm_Electoral_process_and_pluralism"
## [224] "norm_Functioning_of_government"      
## [225] "norm_Political_participation"        
## [226] "norm_Political_culture"              
## [227] "norm_Civil_liberties"                
## [228] "norm_gdp_2005"                       
## [229] "norm_gdp_2018"                       
## [230] "norm_gdp_growth"                     
## [231] "norm_rainbow_score_2019"             
## [232] "norm_rainbow_score_2018"             
## [233] "norm_rainbow_score_avg_2019_2018"    
## [234] "norm_rainbow_score_avg_2013_2014"    
## [235] "norm_rainbow_score_difference"       
## [236] "norm_Happiness_Score"                
## [237] "norm_Unemployment"                   
## [238] "norm_Employment"                     
## [239] "norm_gender_equality_index"          
## [240] "norm_log_gdp_2018"                   
## [241] "norm_composite_equality"             
## [242] "norm_composite_democracy"            
## [243] "norm_mean_age"                       
## [244] "norm_median_age"                     
## [245] "norm_mean_life_satisfaction"         
## [246] "norm_pct_satisfied"                  
## [247] "norm_mean_natl_political_discuss"    
## [248] "norm_mean_eu_political_discuss"      
## [249] "norm_mean_local_political_discuss"   
## [250] "norm_pct_discuss_natl_politics"      
## [251] "norm_pct_discuss_eu_politics"        
## [252] "norm_pct_discuss_local_politics"     
## [253] "norm_mean_trade_support"             
## [254] "norm_pct_trade_support"              
## [255] "norm_pct_pro_globalization"          
## [256] "norm_pct_anti_globalization"         
## [257] "norm_mean_eu_trade_support"          
## [258] "norm_pct_support_eu_trade"           
## [259] "norm_mean_esg_support"               
## [260] "norm_pct_support_esg"                
## [261] "norm_mean_left_right"                
## [262] "norm_pct_left"                       
## [263] "norm_pct_center"                     
## [264] "norm_pct_right"                      
## [265] "norm_mean_education_years"           
## [266] "norm_median_education_years"         
## [267] "norm_pct_high_education"             
## [268] "norm_pct_rural"                      
## [269] "norm_pct_urban"                      
## [270] "norm_pct_large_urban"                
## [271] "norm_mean_financial_difficulty"      
## [272] "norm_pct_financial_difficulty"       
## [273] "norm_mean_subjective_class"          
## [274] "norm_pct_working_class"              
## [275] "norm_pct_middle_class"               
## [276] "norm_pct_upper_class"                
## [277] "norm_mean_voice_in_eu"               
## [278] "norm_pct_voice_in_eu"                
## [279] "norm_pct_friends_diff_ethnic"        
## [280] "norm_pct_friends_diff_skin"          
## [281] "norm_pct_friends_roma"               
## [282] "norm_pct_friends_lgbt"               
## [283] "norm_pct_friends_disabled"           
## [284] "norm_pct_friends_diff_religion"      
## [285] "norm_pct_friends_transgender"        
## [286] "norm_pct_friends_intersex"           
## [287] "norm_pct_ethnic_minority"            
## [288] "norm_pct_skin_minority"              
## [289] "norm_pct_religious_minority"         
## [290] "norm_pct_roma"                       
## [291] "norm_pct_sexual_minority"            
## [292] "norm_pct_disability"                 
## [293] "norm_pct_other_minority"             
## [294] "norm_pct_any_minority"               
## [295] "norm_pct_catholic"                   
## [296] "norm_pct_orthodox"                   
## [297] "norm_pct_protestant"                 
## [298] "norm_pct_other_christian"            
## [299] "norm_pct_jewish"                     
## [300] "norm_pct_muslim"                     
## [301] "norm_pct_atheist"                    
## [302] "norm_pct_nonbeliever"                
## [303] "norm_pct_nonreligious"               
## [304] "norm_mean_discrim_ethnic"            
## [305] "norm_mean_discrim_skin"              
## [306] "norm_mean_discrim_roma"              
## [307] "norm_mean_discrim_lgbt"              
## [308] "norm_mean_discrim_age"               
## [309] "norm_mean_discrim_religion"          
## [310] "norm_mean_discrim_disability"        
## [311] "norm_mean_discrim_transgender"       
## [312] "norm_mean_discrim_gender"            
## [313] "norm_mean_discrim_intersex"
country_data_combined_add <- country_data_combined %>%
  select(iso2, norm_gdp_growth, norm_Unemployment, pct_trade_support, norm_gdp_2018,
         z_gender_equality, z_rainbow_score, z_v2x_libdem, z_v2x_egaldem, z_happiness,
         norm_gdp_growth, z_v2x_gender)

df_reduced_aggregated_complete <- df_reduced_aggregated_complete %>%
  left_join(country_data_combined_add, by = c("isocntry" = "iso2"))

# Check the final dataset
dim(df_reduced_aggregated_complete)
## [1]  28 100
# save it
write_rds(df_reduced_aggregated_complete, "df_reduced_aggregated_complete")


### create different plots
# calculate correlation matrix for numeric variables
cor_matrix <- cor(select_if(df_reduced_aggregated_complete, is.numeric), 
                  use = "pairwise.complete.obs")
## Warning in cor(select_if(df_reduced_aggregated_complete, is.numeric), use =
## "pairwise.complete.obs"): the standard deviation is zero
# (1) visualize complete correlation matrix
corrplot(cor_matrix, method = "color", type = "upper", 
         diag = F,
         tl.col = "black", tl.srt = 45, 
         number.cex = 0.7, tl.cex = 0.7)

# (2) create custom function to analyze correlations with outcome
analyze_correlations_with_outcome <- function(data, outcome_var, min_correlation = 0.3) {
  # numeric columns only
  numeric_data <- data %>% select(where(is.numeric))
  
  # calculate correlations with outcome
  cors <- cor(numeric_data, numeric_data[[outcome_var]], use = "pairwise.complete.obs")
  
  # convert to data frame
  cor_df <- data.frame(
    variable = rownames(cors),
    correlation = cors[,1]) %>%
    # remove the outcome variable correlating with itself
    filter(variable != outcome_var) %>%
    # sort by absolute correlation
    arrange(desc(abs(correlation)))
  
  # filter by minimum correlation threshold
  cor_df <- cor_df %>% filter(abs(correlation) >= min_correlation)
  
  return(cor_df)
}

# run correlation analysis for transgender support
trans_support_correlations <- analyze_correlations_with_outcome(
  df_reduced_aggregated_complete, "pct_support_trans", min_correlation = 0.25)
## Warning in cor(numeric_data, numeric_data[[outcome_var]], use =
## "pairwise.complete.obs"): the standard deviation is zero
# display results
print(trans_support_correlations)
##                                                                    variable
## pct_oppose_trans                                           pct_oppose_trans
## pct_support_nongendered_docs                   pct_support_nongendered_docs
## pct_oppose_nongendered_docs                     pct_oppose_nongendered_docs
## pct_support_lgb_school_materials           pct_support_lgb_school_materials
## z_lgbt_support                                               z_lgbt_support
## norm_lgbt_support                                         norm_lgbt_support
## pct_support_lgb_rights                               pct_support_lgb_rights
## mean_lgb_rights_rev                                     mean_lgb_rights_rev
## pct_support_intersex_school_materials pct_support_intersex_school_materials
## pct_support_trans_school_materials       pct_support_trans_school_materials
## pct_support_same_sex_marriage                 pct_support_same_sex_marriage
## pct_support_same_sex_relationship         pct_support_same_sex_relationship
## pct_comfortable_trans_colleague             pct_comfortable_trans_colleague
## mean_trans_colleague_comfort                   mean_trans_colleague_comfort
## mean_lgb_school_materials_rev                 mean_lgb_school_materials_rev
## mean_same_sex_marriage_rev                       mean_same_sex_marriage_rev
## mean_same_sex_relationship_rev               mean_same_sex_relationship_rev
## mean_intersex_school_materials_rev       mean_intersex_school_materials_rev
## mean_trans_school_materials_rev             mean_trans_school_materials_rev
## mean_men_pda_comfort                                   mean_men_pda_comfort
## norm_pct_friends_lgbt                                 norm_pct_friends_lgbt
## pct_lgb_friends                                             pct_lgb_friends
## pct_friends_lgbt                                           pct_friends_lgbt
## z_pct_friends_lgbt                                       z_pct_friends_lgbt
## pct_comfortable_men_pda                             pct_comfortable_men_pda
## mean_women_pda_comfort                               mean_women_pda_comfort
## pct_comfortable_trans_political             pct_comfortable_trans_political
## pct_comfortable_women_pda                         pct_comfortable_women_pda
## mean_trans_political_comfort                   mean_trans_political_comfort
## mean_trans_child_relationship                 mean_trans_child_relationship
## pct_comfortable_trans_child                     pct_comfortable_trans_child
## z_v2x_egaldem                                                 z_v2x_egaldem
## v2x_egaldem                                                     v2x_egaldem
## composite_equality                                       composite_equality
## z_composite_equality                                   z_composite_equality
## norm_composite_equality                             norm_composite_equality
## rainbow_score_2018                                       rainbow_score_2018
## z_v2x_libdem                                                   z_v2x_libdem
## v2x_libdem                                                       v2x_libdem
## rainbow_score_2019                                       rainbow_score_2019
## z_rainbow_score                                             z_rainbow_score
## z_gender_equality                                         z_gender_equality
## gender_equality_index                                 gender_equality_index
## pct_right                                                         pct_right
## pct_trans_friends                                         pct_trans_friends
## mean_political_ideology                             mean_political_ideology
## mean_left_right                                             mean_left_right
## Happiness_Score                                             Happiness_Score
## z_happiness                                                     z_happiness
## norm_gdp_2018                                                 norm_gdp_2018
## gdp_2018                                                           gdp_2018
## pct_center                                                       pct_center
## norm_gdp_growth                                             norm_gdp_growth
## mean_trans_discrim_country                       mean_trans_discrim_country
## pct_perceive_trans_discrim                       pct_perceive_trans_discrim
## pct_high_education                                       pct_high_education
## pct_high_educ                                                 pct_high_educ
## pct_workplace_trans_discrim                     pct_workplace_trans_discrim
## pct_financial_difficulty                           pct_financial_difficulty
## pct_female                                                       pct_female
## pct_left                                                           pct_left
## mean_age                                                           mean_age
## pct_atheist                                                     pct_atheist
## pct_trade_support                                         pct_trade_support
## z_v2x_gender                                                   z_v2x_gender
## pct_orthodox                                                   pct_orthodox
## pct_religious                                                 pct_religious
## pct_nonreligious                                           pct_nonreligious
## pct_experienced_trans_discrim                 pct_experienced_trans_discrim
## pct_urban                                                         pct_urban
## pct_support_trans_workplace_diversity pct_support_trans_workplace_diversity
## pct_no_minority                                             pct_no_minority
## pct_large_urban                                             pct_large_urban
## pct_protestant                                               pct_protestant
## pct_other_religion                                       pct_other_religion
## pct_ethnic_minority                                     pct_ethnic_minority
## mean_country_discrimination_efforts     mean_country_discrimination_efforts
##                                       correlation
## pct_oppose_trans                       -1.0000000
## pct_support_nongendered_docs            0.9552632
## pct_oppose_nongendered_docs            -0.9552632
## pct_support_lgb_school_materials        0.9099915
## z_lgbt_support                          0.9055394
## norm_lgbt_support                       0.9055394
## pct_support_lgb_rights                  0.9054333
## mean_lgb_rights_rev                     0.8873981
## pct_support_intersex_school_materials   0.8853638
## pct_support_trans_school_materials      0.8828781
## pct_support_same_sex_marriage           0.8820719
## pct_support_same_sex_relationship       0.8799193
## pct_comfortable_trans_colleague         0.8795309
## mean_trans_colleague_comfort            0.8697944
## mean_lgb_school_materials_rev           0.8679107
## mean_same_sex_marriage_rev              0.8669420
## mean_same_sex_relationship_rev          0.8656658
## mean_intersex_school_materials_rev      0.8479568
## mean_trans_school_materials_rev         0.8459165
## mean_men_pda_comfort                    0.8398369
## norm_pct_friends_lgbt                   0.8350491
## pct_lgb_friends                         0.8350491
## pct_friends_lgbt                        0.8350491
## z_pct_friends_lgbt                      0.8350491
## pct_comfortable_men_pda                 0.8313883
## mean_women_pda_comfort                  0.8219693
## pct_comfortable_trans_political         0.8144761
## pct_comfortable_women_pda               0.8135901
## mean_trans_political_comfort            0.8080428
## mean_trans_child_relationship           0.8008226
## pct_comfortable_trans_child             0.7938314
## z_v2x_egaldem                           0.7909474
## v2x_egaldem                             0.7909474
## composite_equality                      0.7752634
## z_composite_equality                    0.7752634
## norm_composite_equality                 0.7752634
## rainbow_score_2018                      0.7539898
## z_v2x_libdem                            0.7147897
## v2x_libdem                              0.7147897
## rainbow_score_2019                      0.7094428
## z_rainbow_score                         0.7094428
## z_gender_equality                       0.7035475
## gender_equality_index                   0.7035475
## pct_right                              -0.6890892
## pct_trans_friends                       0.6648839
## mean_political_ideology                -0.6422553
## mean_left_right                        -0.6422553
## Happiness_Score                         0.6310032
## z_happiness                             0.6310032
## norm_gdp_2018                           0.5771817
## gdp_2018                                0.5771817
## pct_center                              0.5523491
## norm_gdp_growth                        -0.5522688
## mean_trans_discrim_country             -0.5172765
## pct_perceive_trans_discrim              0.5008079
## pct_high_education                      0.4980007
## pct_high_educ                           0.4948097
## pct_workplace_trans_discrim             0.4664769
## pct_financial_difficulty               -0.4504279
## pct_female                             -0.4496147
## pct_left                                0.4253939
## mean_age                                0.4017389
## pct_atheist                             0.4001147
## pct_trade_support                       0.3991507
## z_v2x_gender                            0.3977959
## pct_orthodox                           -0.3976753
## pct_religious                          -0.3825894
## pct_nonreligious                        0.3724599
## pct_experienced_trans_discrim          -0.3707182
## pct_urban                               0.3530711
## pct_support_trans_workplace_diversity   0.3494914
## pct_no_minority                         0.3146637
## pct_large_urban                        -0.3090152
## pct_protestant                          0.3082064
## pct_other_religion                      0.2785021
## pct_ethnic_minority                    -0.2752098
## mean_country_discrimination_efforts     0.2604523
# create a visualization of top correlations
ggplot(trans_support_correlations %>% head(15), 
       aes(x = reorder(variable, correlation), y = correlation)) +
  geom_bar(stat = "identity", aes(fill = correlation > 0)) +
  coord_flip() +
  labs(title = "Top correlated variables of transgender rights support",
       subtitle = "Country-level aggregated data",
       x = "Variables",
       y = "Correlation with % supporting transgender document changes") +
  theme_minimal() +
  scale_fill_manual(values = c("firebrick", "steelblue"), 
                    name = "Direction",
                    labels = c("Negative", "Positive"))

# (3) create correlation matrix for key variables
key_vars <- c("pct_support_trans", 
              trans_support_correlations$variable[1:25], # top 25 correlates
              "gdp_2018", "rainbow_score_2019", "gender_equality_index")

key_cor_matrix <- cor(df_reduced_aggregated_complete[, key_vars], 
                      use = "pairwise.complete.obs")

corrplot(key_cor_matrix, method = "color",
         order = "hclust", diag = F,
         tl.col = "black", tl.srt = 45, 
         number.cex = 0.6, tl.cex = 0.7)

# (4) create correlation matrix for all country variables
key_vars_country <- c("pct_support_trans", "norm_gdp_2018", "rainbow_score_2019",
              "z_gender_equality", "norm_Unemployment", "norm_gdp_growth", 
              "z_v2x_libdem", "z_v2x_egaldem","z_happiness", "z_v2x_gender")

key_cor_matrix_country <- cor(df_reduced_aggregated_complete[, key_vars_country], 
                      use = "pairwise.complete.obs")

corrplot(key_cor_matrix_country, method = "color",
         order = "hclust", diag = F, addCoef.col = "black",
         tl.col = "black", tl.srt = 45, 
         number.cex = 0.6, tl.cex = 0.7)

### 
# regional comparisons
region_summary <- df_reduced_aggregated_complete %>%
  group_by(region) %>%
  summarize(
    n_countries = n(),
    avg_trans_support = mean(pct_support_trans),
    min_support = min(pct_support_trans),
    max_support = max(pct_support_trans),
    std_dev = sd(pct_support_trans)) %>%
  arrange(desc(avg_trans_support))

# visualize regional differences
ggplot(df_reduced_aggregated_complete, aes(x = reorder(region, pct_support_trans, FUN = mean), 
                                           y = pct_support_trans)) +
  geom_boxplot(aes(fill = region)) +
  geom_jitter(width = 0.2, alpha = 0.5) +
  coord_flip() +
  labs(title = "Support for Transgender Rights by European Region",
       x = "Region",
       y = "% Supporting Transgender Rights") +
  theme_minimal() +
  theme(legend.position = "none")

### some regression
# create a scatterplot matrix with regression lines
ggplot(df_reduced_aggregated_complete, 
       aes(x = pct_nonreligious, y = pct_support_trans)) +
  geom_point(aes(size = rainbow_score_2019, color = region)) +
  geom_text_repel(aes(label = country_name), size = 3, max.overlaps = 10) +
  geom_smooth(method = "lm", se = TRUE, color = "darkgray") +
  labs(title = "Relationship Between Non-religiosity and Transgender Rights Support",
       x = "% Non-religious Population",
       y = "% Supporting Transgender Rights",
       size = "Rainbow Score",
       color = "Region") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

ggplot(df_reduced_aggregated_complete, 
       aes(x = gender_equality_index, y = pct_support_trans)) +
  geom_point(aes(size = rainbow_score_2019, color = region)) +
  geom_text_repel(aes(label = country_name), size = 3, max.overlaps = 10) +
  geom_smooth(method = "lm", se = TRUE, color = "darkgray") +
  labs(title = "Gender Equality and Transgender Rights Support",
       x = "Gender Equality Index",
       y = "% Supporting Transgender Rights",
       size = "Rainbow Score",
       color = "Region") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

### clusters
cluster_vars <- c("pct_support_trans", "norm_gdp_2018", 
                  "rainbow_score_2019", "gender_equality_index", 
                  "v2x_libdem", "pct_catholic", "pct_nonreligious")

# Prepare data
cluster_data <- df_reduced_aggregated_complete %>%
  select(country_name, all_of(cluster_vars)) %>%
  na.omit() %>%
  column_to_rownames("country_name")

# Scale the data
cluster_data_scaled <- scale(cluster_data)

# Compute distance matrix
dist_matrix <- dist(cluster_data_scaled, method = "euclidean")

# Hierarchical clustering
hc <- hclust(dist_matrix, method = "ward.D2")

# Plot dendrogram
plot(hc, main = "Clustering of Countries by Support for Transgender Rights",
     sub = "", xlab = "", cex = 0.7)
rect.hclust(hc, k = 4, border = "red")

### summary table
country_summary <- df_reduced_aggregated_complete %>%
  select(country_name, pct_support_trans, rainbow_score_2019, 
         gender_equality_index, norm_gdp_2018,
         pct_nonreligious, v2x_libdem, gdp_2018) %>%
  arrange(desc(pct_support_trans))

kable(country_summary, 
      col.names = c("Country", "Trans Rights Support (%)", "Rainbow Score", 
                    "Gender Equality", "LGBT Comfort", 
                    "Non-religious (%)", "Liberal Democracy", "GDP 2018"),
      digits = 1) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(0, bold = TRUE) %>%
  add_header_above(c(" " = 1, "Support Measures" = 2, "Equality Measures" = 2, 
                     "Social Factors" = 2, "Economic" = 1))
Support Measures
Equality Measures
Social Factors
Economic
Country Trans Rights Support (%) Rainbow Score Gender Equality LGBT Comfort Non-religious (%) Liberal Democracy GDP 2018
Spain 89.1 58.5 68.3 0.2 21.7 0.8 41073.7
Netherlands 86.8 49.7 72.9 0.4 47.6 0.8 58818.0
Malta 82.2 74.7 60.1 0.3 4.8 0.6 48127.7
Denmark 79.5 60.3 76.8 0.4 14.2 0.9 57231.3
France 79.0 62.7 72.6 0.2 22.0 0.8 46397.5
Luxembourg 79.0 41.3 69.0 1.0 20.8 0.8 116334.0
Germany 78.9 47.5 65.5 0.3 28.1 0.8 56272.9
Portugal 75.4 57.5 56.0 0.1 9.7 0.8 34725.2
Ireland 74.1 44.7 69.5 0.7 10.0 0.8 86433.7
Sweden 73.1 52.9 82.6 0.3 37.2 0.9 53122.5
Belgium 73.0 70.4 70.5 0.3 24.9 0.8 52466.5
Finland 71.9 62.2 73.0 0.3 17.0 0.8 49242.7
United Kingdom 70.8 67.6 69.6 0.2 29.4 0.8 47108.0
Austria 61.8 48.5 63.3 0.4 21.5 0.8 56654.5
Greece 59.9 44.8 50.0 0.1 1.8 0.8 29792.0
Estonia 59.1 34.4 56.7 0.1 36.4 0.8 37201.1
Cyprus 53.5 22.0 55.1 0.2 2.0 0.7 40922.7
Slovenia 52.8 39.8 68.4 0.2 6.8 0.7 38656.2
Italy 49.9 19.4 62.1 0.2 12.2 0.8 43583.4
Poland 47.2 18.1 56.8 0.1 4.6 0.5 32360.9
Latvia 46.5 16.8 57.9 0.1 20.1 0.7 29832.1
Croatia 44.8 45.8 53.1 0.1 7.3 0.6 29043.9
Czech Republic 43.8 25.5 53.6 0.2 48.2 0.7 42016.4
Lithuania 41.7 19.9 56.8 0.1 7.0 0.8 36491.9
Slovakia 28.2 29.0 52.4 0.1 8.5 0.7 31514.2
Romania 23.6 20.7 52.4 0.1 1.6 0.6 29569.6
Bulgaria 16.7 25.8 58.0 0.0 4.3 0.5 24052.2
Hungary 16.2 42.8 50.8 0.1 17.4 0.4 32258.3
### check some bivariate relationships; choose three key demographic variblaes
# (1) age
ggplot(df_reduced, aes(x = age, y = as.numeric(qc19 == 1))) +
  geom_jitter(alpha = 0.1, height = 0.05) +
  geom_smooth(method = "loess", se = TRUE) +
  labs(title = "Relationship Between Age and Transgender Rights Support",
       x = "Age", y = "Probability of Support") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# looks like age has a non-linear effect as the curve is flat for ages until ~60 and then falls off
# this suggests we should use a quadratic term for age

# (2) political ideology
ggplot(df_reduced, aes(x = political_ideology, y = as.numeric(qc19 == 1))) +
  geom_jitter(alpha = 0.1, height = 0.05) +
  geom_smooth(method = "loess", se = TRUE) +
  labs(title = "Relationship Between Political Ideology and Transgender Rights Support",
       x = "Political Ideology (Left to Right)", 
       y = "Probability of Support") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# (3a) religion
ggplot(df_reduced_aggregated_complete, aes(x = pct_nonreligious, y = pct_support_trans)) +
  geom_point(aes(size = rainbow_score_2019, color = region), alpha = 0.8) +
  geom_text_repel(aes(label = country_name), size = 3, max.overlaps = 15) +
  geom_smooth(method = "lm", se = TRUE, color = "darkgray") +
  labs(title = "Relationship Between Non-religiosity and Transgender Rights Support",
       subtitle = "Country-level data from Special Eurobarometer 493 (2019)",
       x = "% Non-religious Population",
       y = "% Supporting Transgender Document Changes",
       size = "Rainbow Score",
       color = "Region") +
  theme_minimal() 
## `geom_smooth()` using formula = 'y ~ x'

# (3b) religion disaggregated
religion_support <- df_reduced %>%
  mutate(religion_group = case_when(
    religion == 1 ~ "Catholic",
    religion == 2 ~ "Orthodox",
    religion == 3 ~ "Protestant",
    religion == 4 ~ "Other Christian",
    religion %in% c(6:8) ~ "Muslim",
    religion %in% c(9:11) ~ "Other Religion",
    religion == 13 ~ "Atheist",
    religion == 14 ~ "Non-believer",
    TRUE ~ "Other/Missing")) %>%
  group_by(religion_group) %>%
  summarize(
    support_rate = mean(qc19 == 1, na.rm = TRUE) * 100,
    sample_size = n(),
    se = sqrt((support_rate/100 * (1-support_rate/100)) / sample_size) * 100) %>%
  filter(religion_group != "Other/Missing", sample_size >= 30) %>%
  arrange(desc(support_rate))

ggplot(religion_support, aes(x = reorder(religion_group, support_rate), y = support_rate)) +
  geom_bar(stat = "identity", fill = "steelblue", width = 0.7) +
  geom_errorbar(aes(ymin = support_rate - 1.96*se, 
                    ymax = support_rate + 1.96*se),
                width = 0.2) +
  geom_text(aes(label = sprintf("%.1f%%", support_rate), y = support_rate + 3),
            vjust = 0) +
  geom_text(aes(label = paste0("n=", sample_size), y = -5),
            vjust = 1, size = 3) +
  coord_flip() +
  labs(title = "Support for Transgender Rights by Religious Affiliation",
       subtitle = "Percentage supporting transgender document changes by religion",
       x = NULL,
       y = "Support Rate (%)") +
  ylim(-5, max(religion_support$support_rate) + 10) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text.y = element_text(face = "bold"))

### intermediate conclusion:
# based on the results, we should use the following variables in the multilevel
# regression model

# country-level variables: 
# z_rainbow_score, z_gender_equality, regions, z_v2x_libdem, z_v2z_egaldem, 
# norm_gdp_2018


### use ML for predictor selection (country variables)
# prepare data for random forest by selecting variables we considered before
df_reduced$region <- as.factor(df_reduced$region)
df_reduced$Regime_type <- as.factor(df_reduced$Regime_type)
df_reduced$qc19 <- ifelse(df_reduced$qc19 == 1, 1, ifelse(df_reduced$qc19 == 2, 0, NA)) %>%
  as.factor() # convert target to binary

# remove NAs due to the coding before
df_reduced <- df_reduced %>%
  na.omit()

# join
df_reduced_rf <- df_reduced %>%
  left_join(country_data_combined, by = "country_name") 

model_data <- df_reduced_rf %>%
  select(region.x, z_v2x_libdem, z_v2x_egaldem, z_v2x_freexp, z_v2x_gender, z_v2x_liberal,
         z_gender_equality_index, z_rainbow_score_2019, norm_gdp_2018, norm_Unemployment,
         norm_Happiness_Score, norm_gdp_growth, z_Functioning_of_government,
         z_Electoral_process_and_pluralism, qc19)

# split data into training and testing
set.seed(69420)
train_index <- createDataPartition(model_data$qc19, p = 0.7, list = FALSE)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]

# train random forest
rf_model <- ranger(
  qc19 ~ ., # Exclude country as a predictor
  data = train_data,
  importance = "impurity", # Gini importance
  num.trees = 500,
  mtry = floor(sqrt(ncol(train_data) - 2)), # Default mtry for classification
  probability = TRUE)

# extract variable importance
var_importance <- importance(rf_model)
var_importance_df <- data.frame(
  Variable = names(var_importance),
  Importance = var_importance)
var_importance_df <- var_importance_df[order(var_importance_df$Importance, decreasing = TRUE), ]

# Plot variable importance
ggplot(var_importance_df, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Variable Importance from Random Forest",
       x = "Variables",
       y = "Importance")